From 094e47e980b0796b94b1b8f51f462a64d246e516 Mon Sep 17 00:00:00 2001 From: George Wilson Date: Fri, 9 Mar 2018 21:05:20 -0500 Subject: [PATCH] 9102 zfs should be able to initialize storage devices Reviewed by: John Wren Kennedy Reviewed by: Matthew Ahrens Reviewed by: Pavel Zakharov Reviewed by: Prakash Surya Approved by: Richard Lowe --- usr/src/cmd/truss/codes.c | 2 + usr/src/cmd/zpool/zpool_main.c | 155 ++++ usr/src/cmd/ztest/ztest.c | 96 ++- usr/src/lib/libzfs/common/libzfs.h | 5 + usr/src/lib/libzfs/common/libzfs_pool.c | 94 +++ usr/src/lib/libzfs/common/libzfs_util.c | 7 + usr/src/lib/libzfs/common/mapfile-vers | 1 + usr/src/lib/libzfs_core/common/libzfs_core.c | 37 + usr/src/lib/libzfs_core/common/libzfs_core.h | 4 + usr/src/lib/libzfs_core/common/mapfile-vers | 7 + usr/src/lib/libzpool/common/llib-lzpool | 1 + usr/src/man/man1m/zpool.1m | 31 + usr/src/pkg/manifests/system-test-zfstest.mf | 39 + usr/src/test/zfs-tests/include/commands.cfg | 1 + usr/src/test/zfs-tests/runfiles/delphix.run | 13 + .../functional/cli_root/zpool_initialize/Makefile | 21 + .../cli_root/zpool_initialize/cleanup.ksh | 31 + .../zpool_initialize/zpool_initialize.kshlib | 43 ++ .../zpool_initialize_attach_detach_add_remove.ksh | 68 ++ .../zpool_initialize_import_export.ksh | 78 ++ ...ool_initialize_offline_export_import_online.ksh | 66 ++ .../zpool_initialize_online_offline.ksh | 74 ++ .../zpool_initialize/zpool_initialize_split.ksh | 64 ++ .../zpool_initialize_start_and_cancel_neg.ksh | 60 ++ .../zpool_initialize_start_and_cancel_pos.ksh | 52 ++ .../zpool_initialize_suspend_resume.ksh | 63 ++ .../zpool_initialize_unsupported_vdevs.ksh | 74 ++ .../zpool_initialize_verify_checksums.ksh | 59 ++ .../zpool_initialize_verify_initialized.ksh | 88 +++ usr/src/uts/common/Makefile.files | 1 + usr/src/uts/common/fs/zfs/metaslab.c | 23 +- usr/src/uts/common/fs/zfs/spa.c | 158 +++- usr/src/uts/common/fs/zfs/spa_misc.c | 7 + usr/src/uts/common/fs/zfs/sys/metaslab_impl.h | 10 +- 
usr/src/uts/common/fs/zfs/sys/spa.h | 2 + usr/src/uts/common/fs/zfs/sys/vdev_impl.h | 31 + usr/src/uts/common/fs/zfs/sys/vdev_initialize.h | 46 ++ usr/src/uts/common/fs/zfs/sys/zio_priority.h | 3 +- usr/src/uts/common/fs/zfs/vdev.c | 44 +- usr/src/uts/common/fs/zfs/vdev_disk.c | 1 + usr/src/uts/common/fs/zfs/vdev_file.c | 4 +- usr/src/uts/common/fs/zfs/vdev_indirect.c | 1 + usr/src/uts/common/fs/zfs/vdev_initialize.c | 791 +++++++++++++++++++++ usr/src/uts/common/fs/zfs/vdev_mirror.c | 3 + usr/src/uts/common/fs/zfs/vdev_missing.c | 4 +- usr/src/uts/common/fs/zfs/vdev_queue.c | 16 +- usr/src/uts/common/fs/zfs/vdev_raidz.c | 75 ++ usr/src/uts/common/fs/zfs/vdev_removal.c | 13 + usr/src/uts/common/fs/zfs/vdev_root.c | 3 +- usr/src/uts/common/fs/zfs/zfs_ioctl.c | 80 +++ usr/src/uts/common/sys/fs/zfs.h | 36 + 51 files changed, 2659 insertions(+), 27 deletions(-) create mode 100644 usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/Makefile create mode 100644 usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/cleanup.ksh create mode 100644 usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib create mode 100644 usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh create mode 100644 usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh create mode 100644 usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh create mode 100644 usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh create mode 100644 usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_split.ksh create mode 100644 usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh create mode 100644 
usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh create mode 100644 usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume.ksh create mode 100644 usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs.ksh create mode 100644 usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh create mode 100644 usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh create mode 100644 usr/src/uts/common/fs/zfs/sys/vdev_initialize.h create mode 100644 usr/src/uts/common/fs/zfs/vdev_initialize.c diff --git a/usr/src/cmd/truss/codes.c b/usr/src/cmd/truss/codes.c index 7c510311f9..1e384f2f10 100644 --- a/usr/src/cmd/truss/codes.c +++ b/usr/src/cmd/truss/codes.c @@ -1288,6 +1288,8 @@ const struct ioc { "zfs_cmd_t" }, { (uint_t)ZFS_IOC_CHANNEL_PROGRAM, "ZFS_IOC_CHANNEL_PROGRAM", "zfs_cmd_t" }, + { (uint_t)ZFS_IOC_POOL_INITIALIZE, "ZFS_IOC_POOL_INITIALIZE", + "zfs_cmd_t" }, /* kssl ioctls */ { (uint_t)KSSL_ADD_ENTRY, "KSSL_ADD_ENTRY", diff --git a/usr/src/cmd/zpool/zpool_main.c b/usr/src/cmd/zpool/zpool_main.c index 5f385c6102..31234e6207 100644 --- a/usr/src/cmd/zpool/zpool_main.c +++ b/usr/src/cmd/zpool/zpool_main.c @@ -84,6 +84,7 @@ static int zpool_do_detach(int, char **); static int zpool_do_replace(int, char **); static int zpool_do_split(int, char **); +static int zpool_do_initialize(int, char **); static int zpool_do_scrub(int, char **); static int zpool_do_import(int, char **); @@ -133,6 +134,7 @@ typedef enum { HELP_ONLINE, HELP_REPLACE, HELP_REMOVE, + HELP_INITIALIZE, HELP_SCRUB, HELP_STATUS, HELP_UPGRADE, @@ -184,6 +186,7 @@ static zpool_command_t command_table[] = { { "replace", zpool_do_replace, HELP_REPLACE }, { "split", zpool_do_split, HELP_SPLIT }, { NULL }, + { "initialize", zpool_do_initialize, HELP_INITIALIZE }, { "scrub", 
zpool_do_scrub, HELP_SCRUB }, { NULL }, { "import", zpool_do_import, HELP_IMPORT }, @@ -257,6 +260,8 @@ get_usage(zpool_help_t idx) return (gettext("\tremove [-nps] ...\n")); case HELP_REOPEN: return (gettext("\treopen \n")); + case HELP_INITIALIZE: + return (gettext("\tinitialize [-cs] [ ...]\n")); case HELP_SCRUB: return (gettext("\tscrub [-s | -p] ...\n")); case HELP_STATUS: @@ -1589,6 +1594,43 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv, "resilvering" : "repairing"); } + if ((vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE || + vs->vs_initialize_state == VDEV_INITIALIZE_SUSPENDED || + vs->vs_initialize_state == VDEV_INITIALIZE_COMPLETE) && + !vs->vs_scan_removing) { + char zbuf[1024]; + char tbuf[256]; + struct tm zaction_ts; + + time_t t = vs->vs_initialize_action_time; + int initialize_pct = 100; + if (vs->vs_initialize_state != VDEV_INITIALIZE_COMPLETE) { + initialize_pct = (vs->vs_initialize_bytes_done * 100 / + (vs->vs_initialize_bytes_est + 1)); + } + + (void) localtime_r(&t, &zaction_ts); + (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts); + + switch (vs->vs_initialize_state) { + case VDEV_INITIALIZE_SUSPENDED: + (void) snprintf(zbuf, sizeof (zbuf), + ", suspended, started at %s", tbuf); + break; + case VDEV_INITIALIZE_ACTIVE: + (void) snprintf(zbuf, sizeof (zbuf), + ", started at %s", tbuf); + break; + case VDEV_INITIALIZE_COMPLETE: + (void) snprintf(zbuf, sizeof (zbuf), + ", completed at %s", tbuf); + break; + } + + (void) printf(gettext(" (%d%% initialized%s)"), + initialize_pct, zbuf); + } + (void) printf("\n"); for (c = 0; c < children; c++) { @@ -4165,6 +4207,119 @@ zpool_do_scrub(int argc, char **argv) return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb)); } +static void +zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res) +{ + uint_t children = 0; + nvlist_t **child; + uint_t i; + + (void) nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children); 
+ + if (children == 0) { + char *path = zpool_vdev_name(g_zfs, zhp, nvroot, B_FALSE); + fnvlist_add_boolean(res, path); + free(path); + return; + } + + for (i = 0; i < children; i++) { + zpool_collect_leaves(zhp, child[i], res); + } +} + +/* + * zpool initialize [-cs] [ ...] + * Initialize all unused blocks in the specified vdevs, or all vdevs in the pool + * if none specified. + * + * -c Cancel. Ends active initializing. + * -s Suspend. Initializing can then be restarted with no flags. + */ +int +zpool_do_initialize(int argc, char **argv) +{ + int c; + char *poolname; + zpool_handle_t *zhp; + nvlist_t *vdevs; + int err = 0; + + struct option long_options[] = { + {"cancel", no_argument, NULL, 'c'}, + {"suspend", no_argument, NULL, 's'}, + {0, 0, 0, 0} + }; + + pool_initialize_func_t cmd_type = POOL_INITIALIZE_DO; + while ((c = getopt_long(argc, argv, "cs", long_options, NULL)) != -1) { + switch (c) { + case 'c': + if (cmd_type != POOL_INITIALIZE_DO) { + (void) fprintf(stderr, gettext("-c cannot be " + "combined with other options\n")); + usage(B_FALSE); + } + cmd_type = POOL_INITIALIZE_CANCEL; + break; + case 's': + if (cmd_type != POOL_INITIALIZE_DO) { + (void) fprintf(stderr, gettext("-s cannot be " + "combined with other options\n")); + usage(B_FALSE); + } + cmd_type = POOL_INITIALIZE_SUSPEND; + break; + case '?': + if (optopt != 0) { + (void) fprintf(stderr, + gettext("invalid option '%c'\n"), optopt); + } else { + (void) fprintf(stderr, + gettext("invalid option '%s'\n"), + argv[optind - 1]); + } + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + usage(B_FALSE); + return (-1); + } + + poolname = argv[0]; + zhp = zpool_open(g_zfs, poolname); + if (zhp == NULL) + return (-1); + + vdevs = fnvlist_alloc(); + if (argc == 1) { + /* no individual leaf vdevs specified, so add them all */ + nvlist_t *config = zpool_get_config(zhp, NULL); + nvlist_t *nvroot = 
fnvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE); + zpool_collect_leaves(zhp, nvroot, vdevs); + } else { + int i; + for (i = 1; i < argc; i++) { + fnvlist_add_boolean(vdevs, argv[i]); + } + } + + err = zpool_initialize(zhp, cmd_type, vdevs); + + fnvlist_free(vdevs); + zpool_close(zhp); + + return (err); +} + typedef struct status_cbdata { int cb_count; boolean_t cb_allpools; diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c index dab209f157..c10186aa38 100644 --- a/usr/src/cmd/ztest/ztest.c +++ b/usr/src/cmd/ztest/ztest.c @@ -103,6 +103,7 @@ #include #include #include +#include #include #include #include @@ -346,6 +347,7 @@ ztest_func_t ztest_spa_upgrade; ztest_func_t ztest_device_removal; ztest_func_t ztest_remap_blocks; ztest_func_t ztest_spa_checkpoint_create_discard; +ztest_func_t ztest_initialize; uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ @@ -389,7 +391,8 @@ ztest_info_t ztest_info[] = { &ztest_opts.zo_vdevtime }, { ztest_device_removal, 1, &zopt_sometimes }, { ztest_remap_blocks, 1, &zopt_sometimes }, - { ztest_spa_checkpoint_create_discard, 1, &zopt_rarely } + { ztest_spa_checkpoint_create_discard, 1, &zopt_rarely }, + { ztest_initialize, 1, &zopt_sometimes } }; #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) @@ -5473,6 +5476,97 @@ ztest_spa_rename(ztest_ds_t *zd, uint64_t id) rw_exit(&ztest_name_lock); } +static vdev_t * +ztest_random_concrete_vdev_leaf(vdev_t *vd) +{ + if (vd == NULL) + return (NULL); + + if (vd->vdev_children == 0) + return (vd); + + vdev_t *eligible[vd->vdev_children]; + int eligible_idx = 0, i; + for (i = 0; i < vd->vdev_children; i++) { + vdev_t *cvd = vd->vdev_child[i]; + if (cvd->vdev_top->vdev_removing) + continue; + if (cvd->vdev_children > 0 || + (vdev_is_concrete(cvd) && !cvd->vdev_detached)) { + eligible[eligible_idx++] = cvd; + } + } + VERIFY(eligible_idx > 0); + + uint64_t child_no = 
ztest_random(eligible_idx); + return (ztest_random_concrete_vdev_leaf(eligible[child_no])); +} + +/* ARGSUSED */ +void +ztest_initialize(ztest_ds_t *zd, uint64_t id) +{ + spa_t *spa = ztest_spa; + int error = 0; + + mutex_enter(&ztest_vdev_lock); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + /* Random leaf vdev */ + vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); + if (rand_vd == NULL) { + spa_config_exit(spa, SCL_VDEV, FTAG); + mutex_exit(&ztest_vdev_lock); + return; + } + + /* + * The random vdev we've selected may change as soon as we + * drop the spa_config_lock. We create local copies of things + * we're interested in. + */ + uint64_t guid = rand_vd->vdev_guid; + char *path = strdup(rand_vd->vdev_path); + boolean_t active = rand_vd->vdev_initialize_thread != NULL; + + zfs_dbgmsg("vd %p, guid %llu", rand_vd, guid); + spa_config_exit(spa, SCL_VDEV, FTAG); + + uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); + error = spa_vdev_initialize(spa, guid, cmd); + switch (cmd) { + case POOL_INITIALIZE_CANCEL: + if (ztest_opts.zo_verbose >= 4) { + (void) printf("Cancel initialize %s", path); + if (!active) + (void) printf(" failed (no initialize active)"); + (void) printf("\n"); + } + break; + case POOL_INITIALIZE_DO: + if (ztest_opts.zo_verbose >= 4) { + (void) printf("Start initialize %s", path); + if (active && error == 0) + (void) printf(" failed (already active)"); + else if (error != 0) + (void) printf(" failed (error %d)", error); + (void) printf("\n"); + } + break; + case POOL_INITIALIZE_SUSPEND: + if (ztest_opts.zo_verbose >= 4) { + (void) printf("Suspend initialize %s", path); + if (!active) + (void) printf(" failed (no initialize active)"); + (void) printf("\n"); + } + break; + } + free(path); + mutex_exit(&ztest_vdev_lock); +} + /* * Verify pool integrity by running zdb. 
*/ diff --git a/usr/src/lib/libzfs/common/libzfs.h b/usr/src/lib/libzfs/common/libzfs.h index 1ebaffe9d1..9dc2b02e14 100644 --- a/usr/src/lib/libzfs/common/libzfs.h +++ b/usr/src/lib/libzfs/common/libzfs.h @@ -136,6 +136,9 @@ typedef enum zfs_error { EZFS_NO_CHECKPOINT, /* pool has no checkpoint */ EZFS_DEVRM_IN_PROGRESS, /* a device is currently being removed */ EZFS_VDEV_TOO_BIG, /* a device is too big to be used */ + EZFS_TOOMANY, /* argument list too long */ + EZFS_INITIALIZING, /* currently initializing */ + EZFS_NO_INITIALIZE, /* no active initialize */ EZFS_UNKNOWN } zfs_error_t; @@ -260,6 +263,8 @@ typedef struct splitflags { * Functions to manipulate pool and vdev state */ extern int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t); +extern int zpool_initialize(zpool_handle_t *, pool_initialize_func_t, + nvlist_t *); extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *); extern int zpool_reguid(zpool_handle_t *); extern int zpool_reopen(zpool_handle_t *); diff --git a/usr/src/lib/libzfs/common/libzfs_pool.c b/usr/src/lib/libzfs/common/libzfs_pool.c index 2407e888e7..e5b3ce02dc 100644 --- a/usr/src/lib/libzfs/common/libzfs_pool.c +++ b/usr/src/lib/libzfs/common/libzfs_pool.c @@ -1970,6 +1970,100 @@ zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd) } } +static int +xlate_init_err(int err) +{ + switch (err) { + case ENODEV: + return (EZFS_NODEVICE); + case EINVAL: + case EROFS: + return (EZFS_BADDEV); + case EBUSY: + return (EZFS_INITIALIZING); + case ESRCH: + return (EZFS_NO_INITIALIZE); + } + return (err); +} + +/* + * Begin, suspend, or cancel the initialization (initializing of all free + * blocks) for the given vdevs in the given pool. 
+ */ +int +zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, + nvlist_t *vds) +{ + char msg[1024]; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + nvlist_t *errlist; + + /* translate vdev names to guids */ + nvlist_t *vdev_guids = fnvlist_alloc(); + nvlist_t *guids_to_paths = fnvlist_alloc(); + boolean_t spare, cache; + nvlist_t *tgt; + nvpair_t *elem; + + for (elem = nvlist_next_nvpair(vds, NULL); elem != NULL; + elem = nvlist_next_nvpair(vds, elem)) { + char *vd_path = nvpair_name(elem); + tgt = zpool_find_vdev(zhp, vd_path, &spare, &cache, NULL); + + if ((tgt == NULL) || cache || spare) { + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot initialize '%s'"), + vd_path); + int err = (tgt == NULL) ? EZFS_NODEVICE : + (spare ? EZFS_ISSPARE : EZFS_ISL2CACHE); + fnvlist_free(vdev_guids); + fnvlist_free(guids_to_paths); + return (zfs_error(hdl, err, msg)); + } + + uint64_t guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); + fnvlist_add_uint64(vdev_guids, vd_path, guid); + + (void) snprintf(msg, sizeof (msg), "%llu", guid); + fnvlist_add_string(guids_to_paths, msg, vd_path); + } + + int err = lzc_initialize(zhp->zpool_name, cmd_type, vdev_guids, + &errlist); + fnvlist_free(vdev_guids); + + if (err == 0) { + fnvlist_free(guids_to_paths); + return (0); + } + + nvlist_t *vd_errlist = NULL; + if (errlist != NULL) { + vd_errlist = fnvlist_lookup_nvlist(errlist, + ZPOOL_INITIALIZE_VDEVS); + } + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "operation failed")); + + for (elem = nvlist_next_nvpair(vd_errlist, NULL); elem != NULL; + elem = nvlist_next_nvpair(vd_errlist, elem)) { + int64_t vd_error = xlate_init_err(fnvpair_value_int64(elem)); + char *path = fnvlist_lookup_string(guids_to_paths, + nvpair_name(elem)); + (void) zfs_error_fmt(hdl, vd_error, "cannot initialize '%s'", + path); + } + + fnvlist_free(guids_to_paths); + if (vd_errlist != NULL) + return (-1); + + return (zpool_standard_error(hdl, err, msg)); +} + /* * 
This provides a very minimal check whether a given string is likely a * c#t#d# style string. Users of this are expected to do their own diff --git a/usr/src/lib/libzfs/common/libzfs_util.c b/usr/src/lib/libzfs/common/libzfs_util.c index 3cf83eee9e..aa74189cc8 100644 --- a/usr/src/lib/libzfs/common/libzfs_util.c +++ b/usr/src/lib/libzfs/common/libzfs_util.c @@ -249,6 +249,13 @@ libzfs_error_description(libzfs_handle_t *hdl) return (dgettext(TEXT_DOMAIN, "device removal in progress")); case EZFS_VDEV_TOO_BIG: return (dgettext(TEXT_DOMAIN, "device exceeds supported size")); + case EZFS_TOOMANY: + return (dgettext(TEXT_DOMAIN, "argument list too long")); + case EZFS_INITIALIZING: + return (dgettext(TEXT_DOMAIN, "currently initializing")); + case EZFS_NO_INITIALIZE: + return (dgettext(TEXT_DOMAIN, "there is no active " + "initialization")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: diff --git a/usr/src/lib/libzfs/common/mapfile-vers b/usr/src/lib/libzfs/common/mapfile-vers index 5b7817304f..233fe39d51 100644 --- a/usr/src/lib/libzfs/common/mapfile-vers +++ b/usr/src/lib/libzfs/common/mapfile-vers @@ -215,6 +215,7 @@ SYMBOL_VERSION SUNWprivate_1.1 { zpool_import_props; zpool_import_status; zpool_in_use; + zpool_initialize; zpool_is_bootable; zpool_iter; zpool_label_disk; diff --git a/usr/src/lib/libzfs_core/common/libzfs_core.c b/usr/src/lib/libzfs_core/common/libzfs_core.c index d09304fbbb..ac25820c61 100644 --- a/usr/src/lib/libzfs_core/common/libzfs_core.c +++ b/usr/src/lib/libzfs_core/common/libzfs_core.c @@ -1038,3 +1038,40 @@ lzc_channel_program_nosync(const char *pool, const char *program, return (lzc_channel_program_impl(pool, program, B_FALSE, timeout, memlimit, argnvl, outnvl)); } + +/* + * Changes initializing state. + * + * vdevs should be a list of (, guid) where guid is a uint64 vdev GUID. + * The key is ignored. 
+ * + * If there are errors related to vdev arguments, per-vdev errors are returned + * in an nvlist with the key "vdevs". Each error is a (guid, errno) pair where + * guid is stringified with PRIu64, and errno is one of the following as + * an int64_t: + * - ENODEV if the device was not found + * - EINVAL if the device is not a leaf or is not concrete (e.g. missing) + * - EROFS if the device is not writeable + * - EBUSY start requested but the device is already being initialized + * - ESRCH cancel/suspend requested but device is not being initialized + * + * If the errlist is empty, then return value will be: + * - EINVAL if one or more arguments were invalid + * - Other spa_open failures + * - 0 if the operation succeeded + */ +int +lzc_initialize(const char *poolname, pool_initialize_func_t cmd_type, + nvlist_t *vdevs, nvlist_t **errlist) +{ + int error; + nvlist_t *args = fnvlist_alloc(); + fnvlist_add_uint64(args, ZPOOL_INITIALIZE_COMMAND, (uint64_t)cmd_type); + fnvlist_add_nvlist(args, ZPOOL_INITIALIZE_VDEVS, vdevs); + + error = lzc_ioctl(ZFS_IOC_POOL_INITIALIZE, poolname, args, errlist); + + fnvlist_free(args); + + return (error); +} diff --git a/usr/src/lib/libzfs_core/common/libzfs_core.h b/usr/src/lib/libzfs_core/common/libzfs_core.h index 8c6743f503..d4a9a49cc1 100644 --- a/usr/src/lib/libzfs_core/common/libzfs_core.h +++ b/usr/src/lib/libzfs_core/common/libzfs_core.h @@ -31,6 +31,8 @@ #include #include #include +#include + #ifdef __cplusplus extern "C" { @@ -56,6 +58,8 @@ int lzc_destroy_snaps(nvlist_t *, boolean_t, nvlist_t **); int lzc_bookmark(nvlist_t *, nvlist_t **); int lzc_get_bookmarks(const char *, nvlist_t *, nvlist_t **); int lzc_destroy_bookmarks(nvlist_t *, nvlist_t **); +int lzc_initialize(const char *, pool_initialize_func_t, nvlist_t *, + nvlist_t **); int lzc_snaprange_space(const char *, const char *, uint64_t *); diff --git a/usr/src/lib/libzfs_core/common/mapfile-vers b/usr/src/lib/libzfs_core/common/mapfile-vers index 
7f63f041b6..588eb76e97 100644 --- a/usr/src/lib/libzfs_core/common/mapfile-vers +++ b/usr/src/lib/libzfs_core/common/mapfile-vers @@ -37,6 +37,13 @@ $mapfile_version 2 +SYMBOL_VERSION ILLUMOS_0.3 { + global: + + lzc_initialize; +} ILLUMOS_0.1; + + SYMBOL_VERSION ILLUMOS_0.2 { global: diff --git a/usr/src/lib/libzpool/common/llib-lzpool b/usr/src/lib/libzpool/common/llib-lzpool index 9e52a46aee..43938533ca 100644 --- a/usr/src/lib/libzpool/common/llib-lzpool +++ b/usr/src/lib/libzpool/common/llib-lzpool @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include diff --git a/usr/src/man/man1m/zpool.1m b/usr/src/man/man1m/zpool.1m index cd074eaa1d..f51619f6f1 100644 --- a/usr/src/man/man1m/zpool.1m +++ b/usr/src/man/man1m/zpool.1m @@ -105,6 +105,11 @@ .Ar pool Ns | Ns Ar id .Op Ar newpool .Nm +.Cm initialize +.Op Fl cs +.Ar pool +.Op Ar device Ns ... +.Nm .Cm iostat .Op Fl v .Op Fl T Sy u Ns | Ns Sy d @@ -1332,6 +1337,32 @@ to fully rewind. .El .It Xo .Nm +.Cm initialize +.Op Fl cs +.Ar pool +.Op Ar device Ns ... +.Xc +Begins initializing by writing to all unallocated regions on the specified +devices, or all eligible devices in the pool if no individual devices are +specified. +Only leaf data or log devices may be initialized. +.Bl -tag -width Ds +.It Fl c, -cancel +Cancel initializing on the specified devices, or all eligible devices if none +are specified. +If one or more target devices are invalid or are not currently being +initialized, the command will fail and no cancellation will occur on any device. +.It Fl s, -suspend +Suspend initializing on the specified devices, or all eligible devices if none +are specified. +If one or more target devices are invalid or are not currently being +initialized, the command will fail and no suspension will occur on any device. +Initializing can then be resumed by running +.Nm zpool Cm initialize +with no flags on the relevant target devices. 
+.El +.It Xo +.Nm .Cm iostat .Op Fl v .Op Fl T Sy u Ns | Ns Sy d diff --git a/usr/src/pkg/manifests/system-test-zfstest.mf b/usr/src/pkg/manifests/system-test-zfstest.mf index faf818c66f..47ed5a1f33 100644 --- a/usr/src/pkg/manifests/system-test-zfstest.mf +++ b/usr/src/pkg/manifests/system-test-zfstest.mf @@ -79,6 +79,7 @@ dir path=opt/zfs-tests/tests/functional/cli_root/zpool_get dir path=opt/zfs-tests/tests/functional/cli_root/zpool_history dir path=opt/zfs-tests/tests/functional/cli_root/zpool_import dir path=opt/zfs-tests/tests/functional/cli_root/zpool_import/blockfiles +dir path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize dir path=opt/zfs-tests/tests/functional/cli_root/zpool_labelclear dir path=opt/zfs-tests/tests/functional/cli_root/zpool_offline dir path=opt/zfs-tests/tests/functional/cli_root/zpool_online @@ -1570,6 +1571,44 @@ file \ file \ path=opt/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_rename_001_pos \ mode=0555 +file path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize/cleanup \ + mode=0555 +file \ + path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib \ + mode=0444 +file \ + path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove \ + mode=0555 +file \ + path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_import_export \ + mode=0555 +file \ + path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online \ + mode=0555 +file \ + path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_online_offline \ + mode=0555 +file \ + path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_split \ + mode=0555 +file \ + path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg \ + mode=0555 +file \ + 
path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos \ + mode=0555 +file \ + path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume \ + mode=0555 +file \ + path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs \ + mode=0555 +file \ + path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums \ + mode=0555 +file \ + path=opt/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized \ + mode=0555 file \ path=opt/zfs-tests/tests/functional/cli_root/zpool_labelclear/labelclear.cfg \ mode=0444 diff --git a/usr/src/test/zfs-tests/include/commands.cfg b/usr/src/test/zfs-tests/include/commands.cfg index b85717523a..c9097f0fc2 100644 --- a/usr/src/test/zfs-tests/include/commands.cfg +++ b/usr/src/test/zfs-tests/include/commands.cfg @@ -73,6 +73,7 @@ export USR_BIN_FILES='awk mpstat mv nawk + od pack pagesize pax diff --git a/usr/src/test/zfs-tests/runfiles/delphix.run b/usr/src/test/zfs-tests/runfiles/delphix.run index ff77d8f1f2..e37f606fe0 100644 --- a/usr/src/test/zfs-tests/runfiles/delphix.run +++ b/usr/src/test/zfs-tests/runfiles/delphix.run @@ -296,6 +296,19 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos', tests = ['zpool_labelclear_active', 'zpool_labelclear_exported'] pre = post = +[/opt/zfs-tests/tests/functional/cli_root/zpool_initialize] +tests = ['zpool_initialize_attach_detach_add_remove', + 'zpool_initialize_import_export', + 'zpool_initialize_offline_export_import_online', + 'zpool_initialize_online_offline', + 'zpool_initialize_split', + 'zpool_initialize_start_and_cancel_neg', + 'zpool_initialize_start_and_cancel_pos', + 'zpool_initialize_suspend_resume', + 'zpool_initialize_unsupported_vdevs', + 'zpool_initialize_verify_checksums', + 'zpool_initialize_verify_initialized'] +pre = [/opt/zfs-tests/tests/functional/cli_root/zpool_offline] tests = 
['zpool_offline_001_pos', 'zpool_offline_002_neg'] diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/Makefile b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/Makefile new file mode 100644 index 0000000000..36f8a12f70 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/Makefile @@ -0,0 +1,21 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# + +include $(SRC)/Makefile.master + +ROOTOPTPKG = $(ROOT)/opt/zfs-tests +TARGETDIR = $(ROOTOPTPKG)/tests/functional/cli_root/zpool_initialize + +include $(SRC)/test/zfs-tests/Makefile.com diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/cleanup.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/cleanup.ksh new file mode 100644 index 0000000000..d9f9570f47 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/cleanup.ksh @@ -0,0 +1,31 @@ +#!/usr/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +default_cleanup diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib new file mode 100644 index 0000000000..0f4e7f0fa9 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib @@ -0,0 +1,43 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. 
+# + +function initialize_prog_line # pool disk +{ + typeset pool="$1" + typeset disk="$2" + zpool status "$pool" | grep "$disk" | grep "initialized" +} + +function initialize_progress # pool disk +{ + initialize_prog_line "$1" "$2" | \ + sed 's/.*(\([0-9]\{1,\}\)% initialized.*/\1/g' +} + +function cleanup +{ + if poolexists $TESTPOOL; then + log_must zpool destroy -f $TESTPOOL + fi + + if poolexists $TESTPOOL1; then + log_must zpool destroy -f $TESTPOOL1 + fi +} +log_onexit cleanup diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh new file mode 100644 index 0000000000..2a695025d2 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh @@ -0,0 +1,68 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# Detaching/attaching, adding/removing data devices works with initializing. +# +# STRATEGY: +# 1. Create a single-disk pool. +# 2. Start initializing. +# 3. Attach a second disk, ensure initializing continues. +# 4. Detach the second disk, ensure initializing continues. +# 5. Add a second disk, ensure initializing continues. +# 6. Remove the first disk, ensure initializing stops. +# + +DISK1="$(echo $DISKS | cut -d' ' -f1)" +DISK2="$(echo $DISKS | cut -d' ' -f2)" + +log_must zpool create -f $TESTPOOL $DISK1 + +log_must zpool initialize $TESTPOOL $DISK1 +progress="$(initialize_progress $TESTPOOL $DISK1)" +[[ -z "$progress" ]] && log_fail "Initializing did not start" + +log_must zpool attach $TESTPOOL $DISK1 $DISK2 +new_progress="$(initialize_progress $TESTPOOL $DISK1)" +[[ "$progress" -le "$new_progress" ]] || \ + log_fail "Lost initializing progress on demotion to child vdev" +progress="$new_progress" + +log_must zpool detach $TESTPOOL $DISK2 +new_progress="$(initialize_progress $TESTPOOL $DISK1)" +[[ "$progress" -le "$new_progress" ]] || \ + log_fail "Lost initializing progress on promotion to top vdev" +progress="$new_progress" + +log_must zpool add $TESTPOOL $DISK2 +log_must zpool remove $TESTPOOL $DISK1 +[[ -z "$(initialize_prog_line $TESTPOOL $DISK1)" ]] || \ + log_fail "Initializing continued after initiating removal" + +log_pass "Initializing worked as expected across attach/detach and add/remove" diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh new file mode 100644 index 0000000000..386d2a5dc2 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh @@ -0,0 +1,78 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this 
file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# Initializing automatically resumes across import/export. +# +# STRATEGY: +# 1. Create a one-disk pool. +# 2. Start initializing and verify that initializing is active. +# 3. Export the pool. +# 4. Import the pool. +# 5. Verify that initializing resumes and progress does not regress. +# 6. Suspend initializing. +# 7. Repeat steps 3-4. +# 8. Verify that progress does not regress but initializing is still suspended. 
+# + +DISK1=${DISKS%% *} + +log_must zpool create -f $TESTPOOL $DISK1 +log_must zpool initialize $TESTPOOL + +sleep 2 + +progress="$(initialize_progress $TESTPOOL $DISK1)" +[[ -z "$progress" ]] && log_fail "Initializing did not start" + +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL + +new_progress="$(initialize_progress $TESTPOOL $DISK1)" +[[ -z "$new_progress" ]] && log_fail "Initializing did not restart after import" +[[ "$progress" -le "$new_progress" ]] || \ + log_fail "Initializing lost progress after import" +log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + +log_must zpool initialize -s $TESTPOOL $DISK1 +action_date="$(initialize_prog_line $TESTPOOL $DISK1 | \ + sed 's/.*ed at \(.*\)).*/\1/g')" +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL +new_action_date=$(initialize_prog_line $TESTPOOL $DISK1 | \ + sed 's/.*ed at \(.*\)).*/\1/g') +[[ "$action_date" != "$new_action_date" ]] && \ + log_fail "Initializing action date did not persist across export/import" + +[[ "$new_progress" -le "$(initialize_progress $TESTPOOL $DISK1)" ]] || \ + log_fail "Initializing lost progress after import" + +log_must eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + +log_pass "Initializing retains state as expected across export/import" diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh new file mode 100644 index 0000000000..dedd466e4e --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh @@ -0,0 +1,66 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). 
+# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# Miscellaneous complex sequences of operations function as expected. +# +# STRATEGY: +# 1. Create a pool with a two-way mirror. +# 2. Start initializing, offline, export, import, online and verify that +# initializing state is preserved / initializing behaves as expected +# at each step. 
+# + +DISK1="$(echo $DISKS | cut -d' ' -f1)" +DISK2="$(echo $DISKS | cut -d' ' -f2)" + +log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2 + +log_must zpool initialize $TESTPOOL $DISK1 +log_must zpool offline $TESTPOOL $DISK1 +progress="$(initialize_progress $TESTPOOL $DISK1)" +[[ -z "$progress" ]] && log_fail "Initializing did not start" +log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL + +new_progress="$(initialize_progress $TESTPOOL $DISK1)" +[[ -z "$new_progress" ]] && log_fail "Initializing did not start after import" +[[ "$new_progress" -ge "$progress" ]] || \ + log_fail "Initializing lost progress after import" +log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + +log_must zpool online $TESTPOOL $DISK1 +new_progress="$(initialize_progress $TESTPOOL $DISK1)" +[[ "$new_progress" -ge "$progress" ]] || \ + log_fail "Initializing lost progress after online" + +log_pass "Initializing behaves as expected at each step of:" \ + "initialize + offline + export + import + online" diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh new file mode 100644 index 0000000000..55bd3188c9 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh @@ -0,0 +1,74 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# Initializing automatically resumes across offline/online. +# +# STRATEGY: +# 1. Create a pool with a two-way mirror. +# 2. Start initializing one of the disks and verify that initializing is active. +# 3. Offline the disk. +# 4. Online the disk. +# 5. Verify that initializing resumes and progress does not regress. +# 6. Suspend initializing. +# 7. Repeat steps 3-4 and verify that initializing does not resume. +# + +DISK1=${DISKS%% *} +DISK2="$(echo $DISKS | cut -d' ' -f2)" + +log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2 +log_must zpool initialize $TESTPOOL $DISK1 + +log_must zpool offline $TESTPOOL $DISK1 + +progress="$(initialize_progress $TESTPOOL $DISK1)" +[[ -z "$progress" ]] && log_fail "Initializing did not start" + +log_must zpool online $TESTPOOL $DISK1 + +new_progress="$(initialize_progress $TESTPOOL $DISK1)" +[[ -z "$new_progress" ]] && \ + log_fail "Initializing did not restart after onlining" +[[ "$progress" -le "$new_progress" ]] || \ + log_fail "Initializing lost progress after onlining" +log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + +log_must zpool initialize -s $TESTPOOL $DISK1 +action_date="$(initialize_prog_line $TESTPOOL $DISK1 | \ + sed 's/.*ed at \(.*\)).*/\1/g')" +log_must zpool offline $TESTPOOL $DISK1 +log_must zpool online $TESTPOOL $DISK1 +new_action_date=$(initialize_prog_line $TESTPOOL $DISK1 | \ + sed 's/.*ed at \(.*\)).*/\1/g') 
+[[ "$action_date" != "$new_action_date" ]] && \ + log_fail "Initializing action date did not persist across offline/online" +log_must eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + +log_pass "Initializing performs as expected across offline/online" diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_split.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_split.ksh new file mode 100644 index 0000000000..69b27c26c9 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_split.ksh @@ -0,0 +1,64 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# Initializing state is preserved across zpool split. +# +# STRATEGY: +# 1. Create a pool with a two-way mirror. +# 2. Start initializing both devices. +# 3. Split the pool. Ensure initializing continues on the original. +# 4. Import the new pool. Ensure initializing resumes on it. 
+# + +DISK1="$(echo $DISKS | cut -d' ' -f1)" +DISK2="$(echo $DISKS | cut -d' ' -f2)" +POOL2="${TESTPOOL}_split" + +log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2 + +log_must zpool initialize $TESTPOOL $DISK1 $DISK2 +orig_prog1="$(initialize_progress $TESTPOOL $DISK1)" +orig_prog2="$(initialize_progress $TESTPOOL $DISK2)" +[[ -z "$orig_prog1" ]] && log_fail "Initializing did not start" + +log_must zpool split $TESTPOOL $TESTPOOL1 $DISK2 + +# Ensure initializing continued as expected on the original pool. +[[ "$(initialize_progress $TESTPOOL $DISK1)" -ge "$orig_prog1" ]] || \ + log_fail "Initializing lost progress on original pool" +log_mustnot eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" + +log_must zpool import $TESTPOOL1 + +[[ "$(initialize_progress $TESTPOOL1 $DISK2)" -ge "$orig_prog2" ]] || \ + log_fail "Initializing lost progress on split pool" +log_mustnot eval "initialize_prog_line $TESTPOOL1 $DISK2 | grep suspended" + +log_pass "Initializing behaves as expected on zpool split" diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh new file mode 100644 index 0000000000..59b266d321 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh @@ -0,0 +1,60 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License.
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# Cancelling and suspending initialize doesn't work if not all specified vdevs +# are being initialized. +# +# STRATEGY: +# 1. Create a three-disk pool. +# 2. Start initializing and verify that initializing is active. +# 3. Try to cancel and suspend initializing on the non-initializing disks. +# 4. Try to re-initialize the currently initializing disk. +# + +DISK1=${DISKS%% *} +DISK2="$(echo $DISKS | cut -d' ' -f2)" +DISK3="$(echo $DISKS | cut -d' ' -f3)" + +log_must zpool list -v +log_must zpool create -f $TESTPOOL $DISK1 $DISK2 $DISK3 +log_must zpool initialize $TESTPOOL $DISK1 + +[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \ + log_fail "Initialize did not start" + +log_mustnot zpool initialize -c $TESTPOOL $DISK2 +log_mustnot zpool initialize -c $TESTPOOL $DISK2 $DISK3 + +log_mustnot zpool initialize -s $TESTPOOL $DISK2 +log_mustnot zpool initialize -s $TESTPOOL $DISK2 $DISK3 + +log_mustnot zpool initialize $TESTPOOL $DISK1 + +log_pass "Nonsensical initialize operations fail" diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh new file mode 100644 index 0000000000..5003b5f10b --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh @@ -0,0 
+1,52 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# Starting and stopping an initialize works. +# +# STRATEGY: +# 1. Create a one-disk pool. +# 2. Start initializing and verify that initializing is active. +# 3. Cancel initializing and verify that initializing is not active. 
+# + +DISK1=${DISKS%% *} + +log_must zpool create -f $TESTPOOL $DISK1 +log_must zpool initialize $TESTPOOL + +[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \ + log_fail "Initialize did not start" + +log_must zpool initialize -c $TESTPOOL + +[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] || \ + log_fail "Initialize did not stop" + +log_pass "Initialize start + cancel works" diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume.ksh new file mode 100644 index 0000000000..bce3da5267 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume.ksh @@ -0,0 +1,63 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# Suspending and resuming initializing works. +# +# STRATEGY: +# 1. Create a one-disk pool. +# 2. Start initializing and verify that initializing is active. +# 3. 
Wait 5 seconds, then suspend initializing and verify that the progress +# reporting says so. +# 4. Wait 3 seconds and ensure initializing progress doesn't advance. +# 5. Restart initializing and verify that the progress doesn't regress. +# + +DISK1=${DISKS%% *} + +log_must zpool create -f $TESTPOOL $DISK1 +log_must zpool initialize $TESTPOOL + +[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \ + log_fail "Initializing did not start" + +sleep 5 +log_must zpool initialize -s $TESTPOOL +log_must eval "initialize_prog_line $TESTPOOL $DISK1 | grep suspended" +progress="$(initialize_progress $TESTPOOL $DISK1)" + +sleep 3 +[[ "$progress" -eq "$(initialize_progress $TESTPOOL $DISK1)" ]] || \ + log_fail "Initializing progress advanced while suspended" + +log_must zpool initialize $TESTPOOL $DISK1 +[[ "$progress" -le "$(initialize_progress $TESTPOOL $DISK1)" ]] || + log_fail "Initializing progress regressed after resuming" + +log_pass "Suspend + resume initializing works as expected" diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs.ksh new file mode 100644 index 0000000000..bd4ca069c4 --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs.ksh @@ -0,0 +1,74 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License.
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# Attempting to initialize unsupported vdevs should fail. +# +# STRATEGY: +# 1. Create a pool with the following configuration: +# root +# mirror +# vdev0 +# vdev1 (offline) +# cache +# vdev2 +# spare +# vdev3 +# 2. Try to initialize vdev1, vdev2, and vdev3. Ensure that all 3 fail. +# +function cleanup +{ + if datasetexists $TESTPOOL; then + log_must zpool destroy -f $TESTPOOL + fi + if [[ -d $TESTDIR ]]; then + log_must rm -rf $TESTDIR + fi +} +log_onexit cleanup + +log_must mkdir $TESTDIR +set -A FDISKS +for n in {0..2}; do + log_must mkfile $MINVDEVSIZE $TESTDIR/vdev$n + FDISKS+=("$TESTDIR/vdev$n") +done +FDISKS+=("${DISKS%% *}") + +log_must zpool create $TESTPOOL mirror ${FDISKS[0]} ${FDISKS[1]} \ + spare ${FDISKS[2]} cache ${FDISKS[3]} + +log_must zpool offline $TESTPOOL ${FDISKS[1]} + +log_mustnot zpool initialize $TESTPOOL mirror-0 +for n in {1..3}; do + log_mustnot zpool initialize $TESTPOOL ${FDISKS[$n]} +done + +log_pass "Attempting to initialize failed on unsupported devices" diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh new file mode 100644 index 0000000000..6cc82b9baa --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh @@ -0,0 +1,59 @@ 
+#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# Initializing does not cause file corruption. +# +# STRATEGY: +# 1. Create a one-disk pool. +# 2. Write data to the pool. +# 3. Start initializing and verify that initializing is active. +# 4. Write more data to the pool. +# 5. Run zdb to validate checksums. 
+# + +DISK1=${DISKS%% *} + +log_must zpool create -f $TESTPOOL $DISK1 +log_must /usr/bin/dd if=/dev/urandom of=/$TESTPOOL/file1 bs=1M count=30 +log_must sync + +log_must zpool initialize $TESTPOOL + +log_must zdb -cc $TESTPOOL + +[[ -z "$(initialize_progress $TESTPOOL $DISK1)" ]] && \ + log_fail "Initializing did not start" + +log_must /usr/bin/dd if=/dev/urandom of=/$TESTPOOL/file2 bs=1M count=30 +log_must sync + +log_must zdb -cc $TESTPOOL + +log_pass "Initializing does not corrupt existing or new data" diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh new file mode 100644 index 0000000000..8d20e13e0a --- /dev/null +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh @@ -0,0 +1,88 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# After initializing, the disk is actually initialized. +# +# STRATEGY: +# 1. Create a one-disk pool. +# 2. Initialize the disk to completion. +# 3. Load all metaslabs that don't have a spacemap, and make sure the entire +# metaslab has been filled with the initializing pattern (deadbeef). +# + +function cleanup +{ + mdb -kwe "zfs_initialize_value/Z $ORIG_PATTERN" + zpool import -d $TESTDIR $TESTPOOL + + if datasetexists $TESTPOOL ; then + zpool destroy -f $TESTPOOL + fi + if [[ -d "$TESTDIR" ]]; then + rm -rf "$TESTDIR" + fi +} +log_onexit cleanup +PATTERN="deadbeefdeadbeef" +SMALLFILE="$TESTDIR/smallfile" + +ORIG_PATTERN=$(mdb -ke "zfs_initialize_value/J" | tail -1 | awk '{print $NF}') +log_must mdb -kwe "zfs_initialize_value/Z $PATTERN" + +log_must mkdir "$TESTDIR" +log_must mkfile $MINVDEVSIZE "$SMALLFILE" +log_must zpool create $TESTPOOL "$SMALLFILE" +log_must zpool initialize $TESTPOOL + +while [[ "$(initialize_progress $TESTPOOL $SMALLFILE)" -lt "100" ]]; do + sleep 0.5 +done + +log_must zpool export $TESTPOOL + +spacemaps=0 +bs=512 +while read -r sm; do + typeset offset="$(echo $sm | cut -d ' ' -f1)" + typeset size="$(echo $sm | cut -d ' ' -f2)" + + spacemaps=$((spacemaps + 1)) + offset=$(((4 * 1024 * 1024) + 16#$offset)) + out=$(dd if=$SMALLFILE skip=$(($offset / $bs)) \ + count=$(($size / $bs)) bs=$bs 2>/dev/null | od -t x8 -Ad) + echo "$out" | log_must egrep "$PATTERN|\*|$size" +done <<< "$(zdb -p $TESTDIR -Pme $TESTPOOL | egrep 'spacemap[ ]+0 ' | \ + awk '{print $4, $8}')" + +if [[ $spacemaps -eq 0 ]];then + log_fail "Did not find any empty space maps to check" +else + log_pass "Initializing wrote appropriate amount to disk" +fi diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index a78ec9e47c..20326469ed 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -1439,6 +1439,7 @@ 
ZFS_COMMON_OBJS += \ vdev_indirect.o \ vdev_indirect_births.o \ vdev_indirect_mapping.o \ + vdev_initialize.o \ vdev_label.o \ vdev_mirror.o \ vdev_missing.o \ diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index d8cdccf912..b965654872 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -635,6 +635,8 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL); mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *), KM_SLEEP); mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *), @@ -681,6 +683,8 @@ metaslab_group_destroy(metaslab_group_t *mg) kmem_free(mg->mg_secondaries, mg->mg_allocators * sizeof (metaslab_t *)); mutex_destroy(&mg->mg_lock); + mutex_destroy(&mg->mg_ms_initialize_lock); + cv_destroy(&mg->mg_ms_initialize_cv); for (int i = 0; i < mg->mg_allocators; i++) { refcount_destroy(&mg->mg_alloc_queue_depth[i]); @@ -1541,6 +1545,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); + ms->ms_id = id; ms->ms_start = id << vd->vdev_ms_shift; ms->ms_size = 1ULL << vd->vdev_ms_shift; @@ -2717,6 +2722,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) * from it in 'metaslab_unload_delay' txgs, then unload it. 
*/ if (msp->ms_loaded && + msp->ms_initializing == 0 && msp->ms_selected_txg + metaslab_unload_delay < txg) { for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { VERIFY0(range_tree_space( @@ -2966,6 +2972,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) metaslab_class_t *mc = msp->ms_group->mg_class; VERIFY(!msp->ms_condensing); + VERIFY0(msp->ms_initializing); start = mc->mc_ops->msop_alloc(msp, size); if (start != -1ULL) { @@ -3026,9 +3033,10 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, } /* - * If the selected metaslab is condensing, skip it. + * If the selected metaslab is condensing or being + * initialized, skip it. */ - if (msp->ms_condensing) + if (msp->ms_condensing || msp->ms_initializing > 0) continue; *was_active = msp->ms_allocator != -1; @@ -3193,7 +3201,9 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, /* * If this metaslab is currently condensing then pick again as * we can't manipulate this metaslab until it's committed - * to disk. + * to disk. If this metaslab is being initialized, we shouldn't + * allocate from it since the allocated region might be + * overwritten after allocation. 
*/ if (msp->ms_condensing) { metaslab_trace_add(zal, mg, msp, asize, d, @@ -3202,6 +3212,13 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, ~METASLAB_ACTIVE_MASK); mutex_exit(&msp->ms_lock); continue; + } else if (msp->ms_initializing > 0) { + metaslab_trace_add(zal, mg, msp, asize, d, + TRACE_INITIALIZING, allocator); + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_ACTIVE_MASK); + mutex_exit(&msp->ms_lock); + continue; } offset = metaslab_block_alloc(msp, asize, txg); diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index 9185c5b182..b71710bbd7 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -413,8 +414,9 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) dp = spa_get_dsl(spa); dsl_pool_config_enter(dp, FTAG); - if (err = dsl_dataset_hold_obj(dp, - za.za_first_integer, FTAG, &ds)) { + err = dsl_dataset_hold_obj(dp, + za.za_first_integer, FTAG, &ds); + if (err != 0) { dsl_pool_config_exit(dp, FTAG); break; } @@ -569,7 +571,8 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) break; } - if (error = dmu_objset_hold(strval, FTAG, &os)) + error = dmu_objset_hold(strval, FTAG, &os); + if (error != 0) break; /* @@ -1155,8 +1158,10 @@ spa_activate(spa_t *spa, int mode) spa_create_zio_taskqs(spa); } - for (size_t i = 0; i < TXG_SIZE; i++) - spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 0); + for (size_t i = 0; i < TXG_SIZE; i++) { + spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + } list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); @@ -1315,6 +1320,11 @@ spa_unload(spa_t *spa) */ spa_async_suspend(spa); + if (spa->spa_root_vdev) { + vdev_initialize_stop_all(spa->spa_root_vdev, + VDEV_INITIALIZE_ACTIVE); + } + /* * Stop syncing. */ @@ -1330,10 +1340,10 @@ spa_unload(spa_t *spa) * calling taskq_wait(mg_taskq). 
*/ if (spa->spa_root_vdev != NULL) { - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]); - spa_config_exit(spa, SCL_ALL, FTAG); + spa_config_exit(spa, SCL_ALL, spa); } /* @@ -1367,7 +1377,7 @@ spa_unload(spa_t *spa) bpobj_close(&spa->spa_deferred_bpobj); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); /* * Close all vdevs. @@ -1429,7 +1439,7 @@ spa_unload(spa_t *spa) spa->spa_comment = NULL; } - spa_config_exit(spa, SCL_ALL, FTAG); + spa_config_exit(spa, SCL_ALL, spa); } /* @@ -3866,6 +3876,10 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) spa_restart_removal(spa); spa_spawn_aux_threads(spa); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_initialize_restart(spa->spa_root_vdev); + spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_load_note(spa, "LOADED"); @@ -5347,6 +5361,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, * in which case we can modify its state. */ if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { + /* * Objsets may be open only because they're dirty, so we * have to force it to sync before checking spa_refcnt. @@ -5381,6 +5396,18 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, } /* + * We're about to export or destroy this pool. Make sure + * we stop all initialization activity here before we + * set the spa_final_txg. This will ensure that all + * dirty data resulting from the initialization is + * committed to disk before we unload the pool. + */ + if (spa->spa_root_vdev != NULL) { + vdev_initialize_stop_all(spa->spa_root_vdev, + VDEV_INITIALIZE_ACTIVE); + } + + /* * We want this to be reflected on every label, * so mark them all dirty. spa_unload() will do the * final sync that pushes these changes out.
@@ -6070,6 +6097,86 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) return (error); } +int +spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type) +{ + /* + * We hold the namespace lock through the whole function + * to prevent any changes to the pool while we're starting or + * stopping initialization. The config and state locks are held so that + * we can properly assess the vdev state before we commit to + * the initializing operation. + */ + mutex_enter(&spa_namespace_lock); + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); + + /* Look up vdev and ensure it's a leaf. */ + vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); + if (vd == NULL || vd->vdev_detached) { + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(ENODEV)); + } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(EINVAL)); + } else if (!vdev_writeable(vd)) { + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(EROFS)); + } + mutex_enter(&vd->vdev_initialize_lock); + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + + /* + * When we activate an initialize action we check to see + * if the vdev_initialize_thread is NULL. We do this instead + * of using the vdev_initialize_state since there might be + * a previous initialization process which has completed but + * the thread is not exited. 
+ */ + if (cmd_type == POOL_INITIALIZE_DO && + (vd->vdev_initialize_thread != NULL || + vd->vdev_top->vdev_removing)) { + mutex_exit(&vd->vdev_initialize_lock); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(EBUSY)); + } else if (cmd_type == POOL_INITIALIZE_CANCEL && + (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && + vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { + mutex_exit(&vd->vdev_initialize_lock); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(ESRCH)); + } else if (cmd_type == POOL_INITIALIZE_SUSPEND && + vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { + mutex_exit(&vd->vdev_initialize_lock); + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(ESRCH)); + } + + switch (cmd_type) { + case POOL_INITIALIZE_DO: + vdev_initialize(vd); + break; + case POOL_INITIALIZE_CANCEL: + vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED); + break; + case POOL_INITIALIZE_SUSPEND: + vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED); + break; + default: + panic("invalid cmd_type %llu", (unsigned long long)cmd_type); + } + mutex_exit(&vd->vdev_initialize_lock); + + /* Sync out the initializing state */ + txg_wait_synced(spa->spa_dsl_pool, 0); + mutex_exit(&spa_namespace_lock); + + return (0); +} + + /* * Split a set of devices from their mirrors, and create a new pool from them. */ @@ -6277,6 +6384,19 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, spa_activate(newspa, spa_mode_global); spa_async_suspend(newspa); + for (c = 0; c < children; c++) { + if (vml[c] != NULL) { + /* + * Temporarily stop the initializing activity. We set + * the state to ACTIVE so that we know to resume + * the initializing once the split has completed. 
+ */ + mutex_enter(&vml[c]->vdev_initialize_lock); + vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE); + mutex_exit(&vml[c]->vdev_initialize_lock); + } + } + newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; /* create the new pool from the disks of the original pool */ @@ -6364,6 +6484,10 @@ out: if (vml[c] != NULL) vml[c]->vdev_offline = B_FALSE; } + + /* restart initializing disks as necessary */ + spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); + vdev_reopen(spa->spa_root_vdev); nvlist_free(spa->spa_config_splitting); @@ -6739,6 +6863,14 @@ spa_async_thread(void *arg) if (tasks & SPA_ASYNC_RESILVER) dsl_resilver_restart(spa->spa_dsl_pool, 0); + if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { + mutex_enter(&spa_namespace_lock); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_initialize_restart(spa->spa_root_vdev); + spa_config_exit(spa, SCL_CONFIG, FTAG); + mutex_exit(&spa_namespace_lock); + } + /* * Let the world know that we're done. */ @@ -7384,8 +7516,9 @@ spa_sync(spa_t *spa, uint64_t txg) * Wait for i/os issued in open context that need to complete * before this txg syncs. */ - VERIFY0(zio_wait(spa->spa_txg_zio[txg & TXG_MASK])); - spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 0); + (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); + spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL); /* * Lock out configuration changes. @@ -7674,7 +7807,8 @@ spa_sync(spa_t *spa, uint64_t txg) /* * Update usable space statistics. 
*/ - while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) + while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) + != NULL) vdev_sync_done(vd, txg); spa_update_dspace(spa); diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index 8a348af53c..41342f37ea 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -1196,6 +1197,12 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) if (vd != NULL) { ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL); + if (vd->vdev_ops->vdev_op_leaf) { + mutex_enter(&vd->vdev_initialize_lock); + vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED); + mutex_exit(&vd->vdev_initialize_lock); + } + spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); vdev_free(vd); spa_config_exit(spa, SCL_ALL, spa); diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h index 6a02f7c800..3c4ce37303 100644 --- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h @@ -68,7 +68,8 @@ typedef enum trace_alloc_type { TRACE_GROUP_FAILURE = -5ULL, TRACE_ENOSPC = -6ULL, TRACE_CONDENSING = -7ULL, - TRACE_VDEV_ERROR = -8ULL + TRACE_VDEV_ERROR = -8ULL, + TRACE_INITIALIZING = -9ULL } trace_alloc_type_t; #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) @@ -270,6 +271,11 @@ struct metaslab_group { uint64_t mg_failed_allocations; uint64_t mg_fragmentation; uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE]; + + int mg_ms_initializing; + boolean_t mg_initialize_updating; + kmutex_t mg_ms_initialize_lock; + kcondvar_t mg_ms_initialize_cv; }; /* @@ -360,6 +366,8 @@ struct metaslab { boolean_t ms_condense_wanted; uint64_t ms_condense_checked_txg; + uint64_t ms_initializing; /* leaves initializing this ms */ + /* * We must hold both ms_lock and ms_group->mg_lock in 
order to * modify ms_loaded. diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index 55c306d0a4..34f02ed430 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -650,6 +650,7 @@ extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); #define SPA_ASYNC_AUTOEXPAND 0x20 #define SPA_ASYNC_REMOVE_DONE 0x40 #define SPA_ASYNC_REMOVE_STOP 0x80 +#define SPA_ASYNC_INITIALIZE_RESTART 0x100 /* * Controls the behavior of spa_vdev_remove(). @@ -665,6 +666,7 @@ extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done); extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); extern boolean_t spa_vdev_remove_active(spa_t *spa); +extern int spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index f75233af60..d4ba5669a0 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -79,6 +79,12 @@ typedef void vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg); typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size, vdev_remap_cb_t callback, void *arg); +/* + * Given a target vdev, translates the logical range "in" to the physical + * range "res" + */ +typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg_t *in, + range_seg_t *res); typedef struct vdev_ops { vdev_open_func_t *vdev_op_open; @@ -90,6 +96,11 @@ typedef struct vdev_ops { vdev_hold_func_t *vdev_op_hold; vdev_rele_func_t *vdev_op_rele; vdev_remap_func_t *vdev_op_remap; + /* + * For translating ranges from non-leaf vdevs (e.g. 
raidz) to leaves. + * Used when initializing vdevs. Isn't used by leaf ops. + */ + vdev_xlation_func_t *vdev_op_xlate; char vdev_op_type[16]; boolean_t vdev_op_leaf; } vdev_ops_t; @@ -231,6 +242,24 @@ struct vdev { /* pool checkpoint related */ space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */ + + boolean_t vdev_initialize_exit_wanted; + vdev_initializing_state_t vdev_initialize_state; + kthread_t *vdev_initialize_thread; + /* Protects vdev_initialize_thread and vdev_initialize_state. */ + kmutex_t vdev_initialize_lock; + kcondvar_t vdev_initialize_cv; + uint64_t vdev_initialize_offset[TXG_SIZE]; + uint64_t vdev_initialize_last_offset; + range_tree_t *vdev_initialize_tree; /* valid while initializing */ + uint64_t vdev_initialize_bytes_est; + uint64_t vdev_initialize_bytes_done; + time_t vdev_initialize_action_time; /* start and end time */ + + /* for limiting outstanding I/Os */ + kmutex_t vdev_initialize_io_lock; + kcondvar_t vdev_initialize_io_cv; + uint64_t vdev_initialize_inflight; /* * Values stored in the config for an indirect or removing vdev. @@ -434,6 +463,8 @@ extern vdev_ops_t vdev_indirect_ops; /* * Common size functions */ +extern void vdev_default_xlate(vdev_t *vd, const range_seg_t *in, + range_seg_t *out); extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); extern uint64_t vdev_get_min_asize(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_initialize.h b/usr/src/uts/common/fs/zfs/sys/vdev_initialize.h new file mode 100644 index 0000000000..db4b0572cd --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/vdev_initialize.h @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +#ifndef _SYS_VDEV_INITIALIZE_H +#define _SYS_VDEV_INITIALIZE_H + +#ifdef __cplusplus +extern "C" { +#endif + +extern void vdev_initialize(vdev_t *vd); +extern void vdev_initialize_stop(vdev_t *vd, + vdev_initializing_state_t tgt_state); +extern void vdev_initialize_stop_all(vdev_t *vd, + vdev_initializing_state_t tgt_state); +extern void vdev_initialize_restart(vdev_t *vd); +extern void vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, + range_seg_t *physical_rs); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_INITIALIZE_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zio_priority.h b/usr/src/uts/common/fs/zfs/sys/zio_priority.h index 42ce1ea898..7bd0995728 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio_priority.h +++ b/usr/src/uts/common/fs/zfs/sys/zio_priority.h @@ -13,7 +13,7 @@ * CDDL HEADER END */ /* - * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright (c) 2014, 2016 by Delphix. All rights reserved. 
*/ #ifndef _ZIO_PRIORITY_H #define _ZIO_PRIORITY_H @@ -29,6 +29,7 @@ typedef enum zio_priority { ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */ ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */ + ZIO_PRIORITY_INITIALIZING, /* initializing I/O */ ZIO_PRIORITY_NUM_QUEUEABLE, ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */ diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index 71b690c123..0c0057e9b6 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -49,6 +49,7 @@ #include #include #include +#include /* * Virtual device management. @@ -183,6 +184,14 @@ vdev_getops(const char *type) return (ops); } +/* ARGSUSED */ +void +vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res) +{ + res->rs_start = in->rs_start; + res->rs_end = in->rs_end; +} + /* * Default asize function: return the MAX of psize with the asize of * all children. This is what's used by anything other than RAID-Z. @@ -453,6 +462,11 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); + cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL); + for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, NULL); } @@ -725,6 +739,7 @@ void vdev_free(vdev_t *vd) { spa_t *spa = vd->vdev_spa; + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); /* * vdev_free() implies closing the vdev first. 
This is simpler than @@ -743,6 +758,7 @@ vdev_free(vdev_t *vd) ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); + ASSERT(vd->vdev_initialize_thread == NULL); /* * Discard allocation state. @@ -815,6 +831,10 @@ vdev_free(vdev_t *vd) mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); + mutex_destroy(&vd->vdev_initialize_lock); + mutex_destroy(&vd->vdev_initialize_io_lock); + cv_destroy(&vd->vdev_initialize_io_cv); + cv_destroy(&vd->vdev_initialize_cv); if (vd == spa->spa_root_vdev) spa->spa_root_vdev = NULL; @@ -2841,7 +2861,8 @@ vdev_sync_done(vdev_t *vd, uint64_t txg) ASSERT(vdev_is_concrete(vd)); - while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) + while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) + != NULL) metaslab_sync_done(msp, txg); if (reassess) @@ -3067,6 +3088,15 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } + /* Restart initializing if necessary */ + mutex_enter(&vd->vdev_initialize_lock); + if (vdev_writeable(vd) && + vd->vdev_initialize_thread == NULL && + vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) { + (void) vdev_initialize(vd); + } + mutex_exit(&vd->vdev_initialize_lock); + if (wasoffline || (oldstate < VDEV_STATE_DEGRADED && vd->vdev_state >= VDEV_STATE_DEGRADED)) @@ -3361,8 +3391,18 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); - if (vd->vdev_ops->vdev_op_leaf) + if (vd->vdev_ops->vdev_op_leaf) { vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; + /* + * Report initializing progress. Since we don't have the + * initializing locks held, this is only an estimate (although a + * fairly accurate one).
+ */ + vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done; + vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est; + vs->vs_initialize_state = vd->vdev_initialize_state; + vs->vs_initialize_action_time = vd->vdev_initialize_action_time; + } /* * Report expandable space on top-level, non-auxillary devices only. * The expandable space is reported in terms of metaslab sized units diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index 6cb55a3acf..6adc387c88 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -840,6 +840,7 @@ vdev_ops_t vdev_disk_ops = { vdev_disk_hold, vdev_disk_rele, NULL, + vdev_default_xlate, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c index f93b646fd8..96534436bb 100644 --- a/usr/src/uts/common/fs/zfs/vdev_file.c +++ b/usr/src/uts/common/fs/zfs/vdev_file.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. 
*/ #include @@ -263,6 +263,7 @@ vdev_ops_t vdev_file_ops = { vdev_file_hold, vdev_file_rele, NULL, + vdev_default_xlate, VDEV_TYPE_FILE, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; @@ -282,6 +283,7 @@ vdev_ops_t vdev_disk_ops = { vdev_file_hold, vdev_file_rele, NULL, + vdev_default_xlate, VDEV_TYPE_DISK, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; diff --git a/usr/src/uts/common/fs/zfs/vdev_indirect.c b/usr/src/uts/common/fs/zfs/vdev_indirect.c index 3f2ff799b6..f093a6920f 100644 --- a/usr/src/uts/common/fs/zfs/vdev_indirect.c +++ b/usr/src/uts/common/fs/zfs/vdev_indirect.c @@ -1628,6 +1628,7 @@ vdev_ops_t vdev_indirect_ops = { NULL, NULL, vdev_indirect_remap, + NULL, VDEV_TYPE_INDIRECT, /* name of this vdev type */ B_FALSE /* leaf vdev */ }; diff --git a/usr/src/uts/common/fs/zfs/vdev_initialize.c b/usr/src/uts/common/fs/zfs/vdev_initialize.c new file mode 100644 index 0000000000..559c0153d6 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/vdev_initialize.c @@ -0,0 +1,791 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016 by Delphix. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Maximum number of metaslabs per group that can be initialized + * simultaneously. + */ +int max_initialize_ms = 3; + +/* + * Value that is written to disk during initialization. + */ +uint64_t zfs_initialize_value = 0xdeadbeefdeadbeefULL; + +/* maximum number of I/Os outstanding per leaf vdev */ +int zfs_initialize_limit = 1; + +/* size of initializing writes; default 1MiB, see zfs_remove_max_segment */ +uint64_t zfs_initialize_chunk_size = 1024 * 1024; + +static boolean_t +vdev_initialize_should_stop(vdev_t *vd) +{ + return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) || + vd->vdev_detached || vd->vdev_top->vdev_removing); +} + +static void +vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx) +{ + /* + * We pass in the guid instead of the vdev_t since the vdev may + * have been freed prior to the sync task being processed. This + * happens when a vdev is detached as we call spa_vdev_config_exit(), + * stop the initializing thread, schedule the sync task, and free + * the vdev. Later when the scheduled sync task is invoked, it would + * find that the vdev has been freed.
+ */ + uint64_t guid = *(uint64_t *)arg; + uint64_t txg = dmu_tx_get_txg(tx); + kmem_free(arg, sizeof (uint64_t)); + + vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); + if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) + return; + + uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK]; + vd->vdev_initialize_offset[txg & TXG_MASK] = 0; + + VERIFY(vd->vdev_leaf_zap != 0); + + objset_t *mos = vd->vdev_spa->spa_meta_objset; + + if (last_offset > 0) { + vd->vdev_initialize_last_offset = last_offset; + VERIFY0(zap_update(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, + sizeof (last_offset), 1, &last_offset, tx)); + } + if (vd->vdev_initialize_action_time > 0) { + uint64_t val = (uint64_t)vd->vdev_initialize_action_time; + VERIFY0(zap_update(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val), + 1, &val, tx)); + } + + uint64_t initialize_state = vd->vdev_initialize_state; + VERIFY0(zap_update(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1, + &initialize_state, tx)); +} + +static void +vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) +{ + ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); + spa_t *spa = vd->vdev_spa; + + if (new_state == vd->vdev_initialize_state) + return; + + /* + * Copy the vd's guid, this will be freed by the sync task. + */ + uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); + *guid = vd->vdev_guid; + + /* + * If the vdev was suspended, preserve the original start time.
+ */ + if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) { + vd->vdev_initialize_action_time = gethrestime_sec(); + } + vd->vdev_initialize_state = new_state; + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync, + guid, 2, ZFS_SPACE_CHECK_RESERVED, tx); + + switch (new_state) { + case VDEV_INITIALIZE_ACTIVE: + spa_history_log_internal(spa, "initialize", tx, + "vdev=%s activated", vd->vdev_path); + break; + case VDEV_INITIALIZE_SUSPENDED: + spa_history_log_internal(spa, "initialize", tx, + "vdev=%s suspended", vd->vdev_path); + break; + case VDEV_INITIALIZE_CANCELED: + spa_history_log_internal(spa, "initialize", tx, + "vdev=%s canceled", vd->vdev_path); + break; + case VDEV_INITIALIZE_COMPLETE: + spa_history_log_internal(spa, "initialize", tx, + "vdev=%s complete", vd->vdev_path); + break; + default: + panic("invalid state %llu", (unsigned long long)new_state); + } + + dmu_tx_commit(tx); +} + +static void +vdev_initialize_cb(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + mutex_enter(&vd->vdev_initialize_io_lock); + if (zio->io_error == ENXIO && !vdev_writeable(vd)) { + /* + * The I/O failed because the vdev was unavailable; roll the + * last offset back. (This works because spa_sync waits on + * spa_txg_zio before it runs sync tasks.) + */ + uint64_t *off = + &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK]; + *off = MIN(*off, zio->io_offset); + } else { + /* + * Since initializing is best-effort, we ignore I/O errors and + * rely on vdev_probe to determine if the errors are more + * critical. 
+ */ + if (zio->io_error != 0) + vd->vdev_stat.vs_initialize_errors++; + + vd->vdev_initialize_bytes_done += zio->io_orig_size; + } + ASSERT3U(vd->vdev_initialize_inflight, >, 0); + vd->vdev_initialize_inflight--; + cv_broadcast(&vd->vdev_initialize_io_cv); + mutex_exit(&vd->vdev_initialize_io_lock); + + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); +} + +/* Takes care of physical writing and limiting # of concurrent ZIOs. */ +static int +vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data) +{ + spa_t *spa = vd->vdev_spa; + + /* Limit inflight initializing I/Os */ + mutex_enter(&vd->vdev_initialize_io_lock); + while (vd->vdev_initialize_inflight >= zfs_initialize_limit) { + cv_wait(&vd->vdev_initialize_io_cv, + &vd->vdev_initialize_io_lock); + } + vd->vdev_initialize_inflight++; + mutex_exit(&vd->vdev_initialize_io_lock); + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + uint64_t txg = dmu_tx_get_txg(tx); + + spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); + mutex_enter(&vd->vdev_initialize_lock); + + if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) { + uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); + *guid = vd->vdev_guid; + + /* This is the first write of this txg. */ + dsl_sync_task_nowait(spa_get_dsl(spa), + vdev_initialize_zap_update_sync, guid, 2, + ZFS_SPACE_CHECK_RESERVED, tx); + } + + /* + * We know the vdev struct will still be around since all + * consumers of vdev_free must stop the initialization first. 
+ */ + if (vdev_initialize_should_stop(vd)) { + mutex_enter(&vd->vdev_initialize_io_lock); + ASSERT3U(vd->vdev_initialize_inflight, >, 0); + vd->vdev_initialize_inflight--; + mutex_exit(&vd->vdev_initialize_io_lock); + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); + mutex_exit(&vd->vdev_initialize_lock); + dmu_tx_commit(tx); + return (SET_ERROR(EINTR)); + } + mutex_exit(&vd->vdev_initialize_lock); + + vd->vdev_initialize_offset[txg & TXG_MASK] = start + size; + zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start, + size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL, + ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE)); + /* vdev_initialize_cb releases SCL_STATE_ALL */ + + dmu_tx_commit(tx); + + return (0); +} + +/* + * Translate a logical range to the physical range for the specified vdev_t. + * This function is initially called with a leaf vdev and will walk each + * parent vdev until it reaches a top-level vdev. Once the top-level is + * reached the physical range is initialized and the recursive function + * begins to unwind. As it unwinds it calls the parent's vdev specific + * translation function to do the real conversion. + */ +void +vdev_xlate(vdev_t *vd, const range_seg_t *logical_rs, range_seg_t *physical_rs) +{ + /* + * Walk up the vdev tree + */ + if (vd != vd->vdev_top) { + vdev_xlate(vd->vdev_parent, logical_rs, physical_rs); + } else { + /* + * We've reached the top-level vdev, initialize the + * physical range to the logical range and start to + * unwind. + */ + physical_rs->rs_start = logical_rs->rs_start; + physical_rs->rs_end = logical_rs->rs_end; + return; + } + + vdev_t *pvd = vd->vdev_parent; + ASSERT3P(pvd, !=, NULL); + ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL); + + /* + * As this recursive function unwinds, translate the logical + * range into its physical components by calling the + * vdev specific translate function. 
+ */ + range_seg_t intermediate = { 0 }; + pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate); + + physical_rs->rs_start = intermediate.rs_start; + physical_rs->rs_end = intermediate.rs_end; +} + +/* + * Callback to fill each ABD chunk with zfs_initialize_value. len must be + * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD + * allocation will guarantee these for us. + */ +/* ARGSUSED */ +static int +vdev_initialize_block_fill(void *buf, size_t len, void *unused) +{ + ASSERT0(len % sizeof (uint64_t)); + for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) { + *(uint64_t *)((char *)(buf) + i) = zfs_initialize_value; + } + return (0); +} + +static abd_t * +vdev_initialize_block_alloc() +{ + /* Allocate ABD for filler data */ + abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE); + + ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t)); + (void) abd_iterate_func(data, 0, zfs_initialize_chunk_size, + vdev_initialize_block_fill, NULL); + + return (data); +} + +static void +vdev_initialize_block_free(abd_t *data) +{ + abd_free(data); +} + +static int +vdev_initialize_ranges(vdev_t *vd, abd_t *data) +{ + avl_tree_t *rt = &vd->vdev_initialize_tree->rt_root; + + for (range_seg_t *rs = avl_first(rt); rs != NULL; + rs = AVL_NEXT(rt, rs)) { + uint64_t size = rs->rs_end - rs->rs_start; + + /* Split range into legally-sized physical chunks */ + uint64_t writes_required = + ((size - 1) / zfs_initialize_chunk_size) + 1; + + for (uint64_t w = 0; w < writes_required; w++) { + int error; + + error = vdev_initialize_write(vd, + VDEV_LABEL_START_SIZE + rs->rs_start + + (w * zfs_initialize_chunk_size), + MIN(size - (w * zfs_initialize_chunk_size), + zfs_initialize_chunk_size), data); + if (error != 0) + return (error); + } + } + return (0); +} + +static void +vdev_initialize_ms_load(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + metaslab_load_wait(msp); + if (!msp->ms_loaded) + VERIFY0(metaslab_load(msp)); +} + +static 
void +vdev_initialize_mg_wait(metaslab_group_t *mg) +{ + ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock)); + while (mg->mg_initialize_updating) { + cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock); + } +} + +static void +vdev_initialize_mg_mark(metaslab_group_t *mg) +{ + ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock)); + ASSERT(mg->mg_initialize_updating); + + while (mg->mg_ms_initializing >= max_initialize_ms) { + cv_wait(&mg->mg_ms_initialize_cv, &mg->mg_ms_initialize_lock); + } + mg->mg_ms_initializing++; + ASSERT3U(mg->mg_ms_initializing, <=, max_initialize_ms); +} + +/* + * Mark the metaslab as being initialized to prevent any allocations + * on this metaslab. We must also track how many metaslabs are currently + * being initialized within a metaslab group and limit them to prevent + * allocation failures from occurring because all metaslabs are being + * initialized. + */ +static void +vdev_initialize_ms_mark(metaslab_t *msp) +{ + ASSERT(!MUTEX_HELD(&msp->ms_lock)); + metaslab_group_t *mg = msp->ms_group; + + mutex_enter(&mg->mg_ms_initialize_lock); + + /* + * To keep an accurate count of how many threads are initializing + * a specific metaslab group, we only allow one thread to mark + * the metaslab group at a time. This ensures that the value of + * ms_initializing will be accurate when we decide to mark a metaslab + * group as being initialized. To do this we force all other threads + * to wait till the metaslab group's mg_initialize_updating flag is no + * longer set. 
+ */ + vdev_initialize_mg_wait(mg); + mg->mg_initialize_updating = B_TRUE; + if (msp->ms_initializing == 0) { + vdev_initialize_mg_mark(mg); + } + mutex_enter(&msp->ms_lock); + msp->ms_initializing++; + mutex_exit(&msp->ms_lock); + + mg->mg_initialize_updating = B_FALSE; + cv_broadcast(&mg->mg_ms_initialize_cv); + mutex_exit(&mg->mg_ms_initialize_lock); +} + +static void +vdev_initialize_ms_unmark(metaslab_t *msp) +{ + ASSERT(!MUTEX_HELD(&msp->ms_lock)); + metaslab_group_t *mg = msp->ms_group; + mutex_enter(&mg->mg_ms_initialize_lock); + mutex_enter(&msp->ms_lock); + if (--msp->ms_initializing == 0) { + mg->mg_ms_initializing--; + cv_broadcast(&mg->mg_ms_initialize_cv); + } + mutex_exit(&msp->ms_lock); + mutex_exit(&mg->mg_ms_initialize_lock); +} + +static void +vdev_initialize_calculate_progress(vdev_t *vd) +{ + ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || + spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); + ASSERT(vd->vdev_leaf_zap != 0); + + vd->vdev_initialize_bytes_est = 0; + vd->vdev_initialize_bytes_done = 0; + + for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) { + metaslab_t *msp = vd->vdev_top->vdev_ms[i]; + mutex_enter(&msp->ms_lock); + + uint64_t ms_free = msp->ms_size - + space_map_allocated(msp->ms_sm); + + if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) + ms_free /= vd->vdev_top->vdev_children; + + /* + * Convert the metaslab range to a physical range + * on our vdev. We use this to determine if we are + * in the middle of this metaslab range. 
+ */ + range_seg_t logical_rs, physical_rs; + logical_rs.rs_start = msp->ms_start; + logical_rs.rs_end = msp->ms_start + msp->ms_size; + vdev_xlate(vd, &logical_rs, &physical_rs); + + if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) { + vd->vdev_initialize_bytes_est += ms_free; + mutex_exit(&msp->ms_lock); + continue; + } else if (vd->vdev_initialize_last_offset > + physical_rs.rs_end) { + vd->vdev_initialize_bytes_done += ms_free; + vd->vdev_initialize_bytes_est += ms_free; + mutex_exit(&msp->ms_lock); + continue; + } + + /* + * If we get here, we're in the middle of initializing this + * metaslab. Load it and walk the free tree for more accurate + * progress estimation. + */ + vdev_initialize_ms_load(msp); + + for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); rs; + rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) { + logical_rs.rs_start = rs->rs_start; + logical_rs.rs_end = rs->rs_end; + vdev_xlate(vd, &logical_rs, &physical_rs); + + uint64_t size = physical_rs.rs_end - + physical_rs.rs_start; + vd->vdev_initialize_bytes_est += size; + if (vd->vdev_initialize_last_offset > + physical_rs.rs_end) { + vd->vdev_initialize_bytes_done += size; + } else if (vd->vdev_initialize_last_offset > + physical_rs.rs_start && + vd->vdev_initialize_last_offset < + physical_rs.rs_end) { + vd->vdev_initialize_bytes_done += + vd->vdev_initialize_last_offset - + physical_rs.rs_start; + } + } + mutex_exit(&msp->ms_lock); + } +} + +static void +vdev_initialize_load(vdev_t *vd) +{ + ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || + spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); + ASSERT(vd->vdev_leaf_zap != 0); + + if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE || + vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) { + int err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, + sizeof (vd->vdev_initialize_last_offset), 1, + &vd->vdev_initialize_last_offset); + ASSERT(err 
== 0 || err == ENOENT); + } + + vdev_initialize_calculate_progress(vd); +} + + +/* + * Convert the logical range into a physical range and add it to our + * avl tree. + */ +void +vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size) +{ + vdev_t *vd = arg; + range_seg_t logical_rs, physical_rs; + logical_rs.rs_start = start; + logical_rs.rs_end = start + size; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + vdev_xlate(vd, &logical_rs, &physical_rs); + + IMPLY(vd->vdev_top == vd, + logical_rs.rs_start == physical_rs.rs_start); + IMPLY(vd->vdev_top == vd, + logical_rs.rs_end == physical_rs.rs_end); + + /* Only add segments that we have not visited yet */ + if (physical_rs.rs_end <= vd->vdev_initialize_last_offset) + return; + + /* Pick up where we left off mid-range. */ + if (vd->vdev_initialize_last_offset > physical_rs.rs_start) { + zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to " + "(%llu, %llu)", vd->vdev_path, + (u_longlong_t)physical_rs.rs_start, + (u_longlong_t)physical_rs.rs_end, + (u_longlong_t)vd->vdev_initialize_last_offset, + (u_longlong_t)physical_rs.rs_end); + ASSERT3U(physical_rs.rs_end, >, + vd->vdev_initialize_last_offset); + physical_rs.rs_start = vd->vdev_initialize_last_offset; + } + ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start); + + /* + * With raidz, it's possible that the logical range does not live on + * this leaf vdev. We only add the physical range to this vdev's tree if + * it has a length greater than 0. 
+ */ + if (physical_rs.rs_end > physical_rs.rs_start) { + range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start, + physical_rs.rs_end - physical_rs.rs_start); + } else { + ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start); + } +} + +static void +vdev_initialize_thread(void *arg) +{ + vdev_t *vd = arg; + spa_t *spa = vd->vdev_spa; + int error = 0; + uint64_t ms_count = 0; + + ASSERT(vdev_is_concrete(vd)); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + vd->vdev_initialize_last_offset = 0; + vdev_initialize_load(vd); + + abd_t *deadbeef = vdev_initialize_block_alloc(); + + vd->vdev_initialize_tree = range_tree_create(NULL, NULL); + + for (uint64_t i = 0; !vd->vdev_detached && + i < vd->vdev_top->vdev_ms_count; i++) { + metaslab_t *msp = vd->vdev_top->vdev_ms[i]; + + /* + * If we've expanded the top-level vdev or it's our + * first pass, calculate our progress. + */ + if (vd->vdev_top->vdev_ms_count != ms_count) { + vdev_initialize_calculate_progress(vd); + ms_count = vd->vdev_top->vdev_ms_count; + } + + vdev_initialize_ms_mark(msp); + mutex_enter(&msp->ms_lock); + vdev_initialize_ms_load(msp); + + range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add, + vd); + mutex_exit(&msp->ms_lock); + + spa_config_exit(spa, SCL_CONFIG, FTAG); + error = vdev_initialize_ranges(vd, deadbeef); + vdev_initialize_ms_unmark(msp); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL); + if (error != 0) + break; + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); + mutex_enter(&vd->vdev_initialize_io_lock); + while (vd->vdev_initialize_inflight > 0) { + cv_wait(&vd->vdev_initialize_io_cv, + &vd->vdev_initialize_io_lock); + } + mutex_exit(&vd->vdev_initialize_io_lock); + + range_tree_destroy(vd->vdev_initialize_tree); + vdev_initialize_block_free(deadbeef); + vd->vdev_initialize_tree = NULL; + + mutex_enter(&vd->vdev_initialize_lock); + if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) { + 
vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE); + } + ASSERT(vd->vdev_initialize_thread != NULL || + vd->vdev_initialize_inflight == 0); + + /* + * Drop the vdev_initialize_lock while we sync out the + * txg since it's possible that a device might be trying to + * come online and must check to see if it needs to restart an + * initialization. That thread will be holding the spa_config_lock + * which would prevent the txg_wait_synced from completing. + */ + mutex_exit(&vd->vdev_initialize_lock); + txg_wait_synced(spa_get_dsl(spa), 0); + mutex_enter(&vd->vdev_initialize_lock); + + vd->vdev_initialize_thread = NULL; + cv_broadcast(&vd->vdev_initialize_cv); + mutex_exit(&vd->vdev_initialize_lock); +} + +/* + * Initiates a device. Caller must hold vdev_initialize_lock. + * Device must be a leaf and not already be initializing. + */ +void +vdev_initialize(vdev_t *vd) +{ + ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); + ASSERT(vd->vdev_ops->vdev_op_leaf); + ASSERT(vdev_is_concrete(vd)); + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); + ASSERT(!vd->vdev_detached); + ASSERT(!vd->vdev_initialize_exit_wanted); + ASSERT(!vd->vdev_top->vdev_removing); + + vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE); + vd->vdev_initialize_thread = thread_create(NULL, 0, + vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri); +} + +/* + * Stop initializing a device, with the resultant initializing state being + * tgt_state. Blocks until the initializing thread has exited. + * Caller must hold vdev_initialize_lock and must not be writing to the spa + * config, as the initializing thread may try to enter the config as a reader + * before exiting. 
+ */ +void +vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state) +{ + spa_t *spa = vd->vdev_spa; + ASSERT(!spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_WRITER)); + + ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); + ASSERT(vd->vdev_ops->vdev_op_leaf); + ASSERT(vdev_is_concrete(vd)); + + /* + * Allow cancel requests to proceed even if the initialize thread + * has stopped. + */ + if (vd->vdev_initialize_thread == NULL && + tgt_state != VDEV_INITIALIZE_CANCELED) { + return; + } + + vdev_initialize_change_state(vd, tgt_state); + vd->vdev_initialize_exit_wanted = B_TRUE; + while (vd->vdev_initialize_thread != NULL) + cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock); + + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); + vd->vdev_initialize_exit_wanted = B_FALSE; +} + +static void +vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state) +{ + if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) { + mutex_enter(&vd->vdev_initialize_lock); + vdev_initialize_stop(vd, tgt_state); + mutex_exit(&vd->vdev_initialize_lock); + return; + } + + for (uint64_t i = 0; i < vd->vdev_children; i++) { + vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state); + } +} + +/* + * Convenience function to stop initializing of a vdev tree and set all + * initialize thread pointers to NULL. 
+ */ +void +vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state) +{ + vdev_initialize_stop_all_impl(vd, tgt_state); + + if (vd->vdev_spa->spa_sync_on) { + /* Make sure that our state has been synced to disk */ + txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0); + } +} + +void +vdev_initialize_restart(vdev_t *vd) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); + + if (vd->vdev_leaf_zap != 0) { + mutex_enter(&vd->vdev_initialize_lock); + uint64_t initialize_state = VDEV_INITIALIZE_NONE; + int err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE, + sizeof (initialize_state), 1, &initialize_state); + ASSERT(err == 0 || err == ENOENT); + vd->vdev_initialize_state = initialize_state; + + uint64_t timestamp = 0; + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, + sizeof (timestamp), 1, &timestamp); + ASSERT(err == 0 || err == ENOENT); + vd->vdev_initialize_action_time = (time_t)timestamp; + + if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED || + vd->vdev_offline) { + /* load progress for reporting, but don't resume */ + vdev_initialize_load(vd); + } else if (vd->vdev_initialize_state == + VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd)) { + vdev_initialize(vd); + } + + mutex_exit(&vd->vdev_initialize_lock); + } + + for (uint64_t i = 0; i < vd->vdev_children; i++) { + vdev_initialize_restart(vd->vdev_child[i]); + } +} diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c index 34a750fe4d..133558d3d3 100644 --- a/usr/src/uts/common/fs/zfs/vdev_mirror.c +++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c @@ -564,6 +564,7 @@ vdev_ops_t vdev_mirror_ops = { NULL, NULL, NULL, + vdev_default_xlate, VDEV_TYPE_MIRROR, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; @@ -578,6 +579,7 @@ vdev_ops_t vdev_replacing_ops = { NULL, NULL, NULL, + 
vdev_default_xlate, VDEV_TYPE_REPLACING, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; @@ -592,6 +594,7 @@ vdev_ops_t vdev_spare_ops = { NULL, NULL, NULL, + vdev_default_xlate, VDEV_TYPE_SPARE, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; diff --git a/usr/src/uts/common/fs/zfs/vdev_missing.c b/usr/src/uts/common/fs/zfs/vdev_missing.c index d7d017fb8f..c761de8a20 100644 --- a/usr/src/uts/common/fs/zfs/vdev_missing.c +++ b/usr/src/uts/common/fs/zfs/vdev_missing.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. */ /* @@ -89,6 +89,7 @@ vdev_ops_t vdev_missing_ops = { NULL, NULL, NULL, + NULL, VDEV_TYPE_MISSING, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; @@ -103,6 +104,7 @@ vdev_ops_t vdev_hole_ops = { NULL, NULL, NULL, + NULL, VDEV_TYPE_HOLE, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index f0cca0538d..f29f4eeb9d 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -150,6 +150,8 @@ uint32_t zfs_vdev_scrub_min_active = 1; uint32_t zfs_vdev_scrub_max_active = 2; uint32_t zfs_vdev_removal_min_active = 1; uint32_t zfs_vdev_removal_max_active = 2; +uint32_t zfs_vdev_initializing_min_active = 1; +uint32_t zfs_vdev_initializing_max_active = 1; /* * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent @@ -407,6 +409,8 @@ vdev_queue_class_min_active(zio_priority_t p) return (zfs_vdev_scrub_min_active); case ZIO_PRIORITY_REMOVAL: return (zfs_vdev_removal_min_active); + case ZIO_PRIORITY_INITIALIZING: + return (zfs_vdev_initializing_min_active); default: panic("invalid priority %u", p); return (0); @@ -468,6 +472,8 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) return (zfs_vdev_scrub_max_active); case ZIO_PRIORITY_REMOVAL: return 
(zfs_vdev_removal_max_active); + case ZIO_PRIORITY_INITIALIZING: + return (zfs_vdev_initializing_max_active); default: panic("invalid priority %u", p); return (0); @@ -688,8 +694,8 @@ again: } /* - * For LBA-ordered queues (async / scrub), issue the i/o which follows - * the most recently issued i/o in LBA (offset) order. + * For LBA-ordered queues (async / scrub / initializing), issue the + * i/o which follows the most recently issued i/o in LBA (offset) order. * * For FIFO queues (sync), issue the i/o with the lowest timestamp. */ @@ -745,13 +751,15 @@ vdev_queue_io(zio_t *zio) if (zio->io_priority != ZIO_PRIORITY_SYNC_READ && zio->io_priority != ZIO_PRIORITY_ASYNC_READ && zio->io_priority != ZIO_PRIORITY_SCRUB && - zio->io_priority != ZIO_PRIORITY_REMOVAL) + zio->io_priority != ZIO_PRIORITY_REMOVAL && + zio->io_priority != ZIO_PRIORITY_INITIALIZING) zio->io_priority = ZIO_PRIORITY_ASYNC_READ; } else { ASSERT(zio->io_type == ZIO_TYPE_WRITE); if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE && - zio->io_priority != ZIO_PRIORITY_REMOVAL) + zio->io_priority != ZIO_PRIORITY_REMOVAL && + zio->io_priority != ZIO_PRIORITY_INITIALIZING) zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE; } diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c index 60360a0a46..0e6dfcc2c0 100644 --- a/usr/src/uts/common/fs/zfs/vdev_raidz.c +++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c @@ -38,6 +38,10 @@ #include #include +#ifdef ZFS_DEBUG +#include /* vdev_xlate testing */ +#endif + /* * Virtual device vector for RAID-Z. 
* @@ -1884,6 +1888,39 @@ vdev_raidz_child_done(zio_t *zio) rc->rc_skipped = 0; } +static void +vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col) +{ +#ifdef ZFS_DEBUG + vdev_t *vd = zio->io_vd; + vdev_t *tvd = vd->vdev_top; + + range_seg_t logical_rs, physical_rs; + logical_rs.rs_start = zio->io_offset; + logical_rs.rs_end = logical_rs.rs_start + + vdev_raidz_asize(zio->io_vd, zio->io_size); + + raidz_col_t *rc = &rm->rm_col[col]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + vdev_xlate(cvd, &logical_rs, &physical_rs); + ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); + ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); + /* + * It would be nice to assert that rs_end is equal + * to rc_offset + rc_size but there might be an + * optional I/O at the end that is not accounted in + * rc_size. + */ + if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { + ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + + rc->rc_size + (1 << tvd->vdev_ashift)); + } else { + ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); + } +#endif +} + /* * Start an IO operation on a RAIDZ VDev * @@ -1926,6 +1963,12 @@ vdev_raidz_io_start(zio_t *zio) for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_devidx]; + + /* + * Verify physical to logical translation. 
+ */ + vdev_raidz_io_verify(zio, rm, c); + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, 0, @@ -2555,6 +2598,37 @@ vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); } +static void +vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res) +{ + vdev_t *raidvd = cvd->vdev_parent; + ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); + + uint64_t width = raidvd->vdev_children; + uint64_t tgt_col = cvd->vdev_id; + uint64_t ashift = raidvd->vdev_top->vdev_ashift; + + /* make sure the offsets are block-aligned */ + ASSERT0(in->rs_start % (1 << ashift)); + ASSERT0(in->rs_end % (1 << ashift)); + uint64_t b_start = in->rs_start >> ashift; + uint64_t b_end = in->rs_end >> ashift; + + uint64_t start_row = 0; + if (b_start > tgt_col) /* avoid underflow */ + start_row = ((b_start - tgt_col - 1) / width) + 1; + + uint64_t end_row = 0; + if (b_end > tgt_col) + end_row = ((b_end - tgt_col - 1) / width) + 1; + + res->rs_start = start_row << ashift; + res->rs_end = end_row << ashift; + + ASSERT3U(res->rs_start, <=, in->rs_start); + ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start); +} + vdev_ops_t vdev_raidz_ops = { vdev_raidz_open, vdev_raidz_close, @@ -2565,6 +2639,7 @@ vdev_ops_t vdev_raidz_ops = { NULL, NULL, NULL, + vdev_raidz_xlate, VDEV_TYPE_RAIDZ, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; diff --git a/usr/src/uts/common/fs/zfs/vdev_removal.c b/usr/src/uts/common/fs/zfs/vdev_removal.c index fc613ff58a..f913432bd0 100644 --- a/usr/src/uts/common/fs/zfs/vdev_removal.c +++ b/usr/src/uts/common/fs/zfs/vdev_removal.c @@ -44,6 +44,7 @@ #include #include #include +#include /* * This file contains the necessary logic to remove vdevs from a @@ -1021,6 +1022,7 @@ vdev_remove_complete(spa_t *spa) txg_wait_synced(spa->spa_dsl_pool, 0); txg = spa_vdev_enter(spa); vdev_t *vd = vdev_lookup_top(spa, 
spa->spa_vdev_removal->svr_vdev_id); + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); sysevent_t *ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_DEV); @@ -1659,6 +1661,9 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) /* Make sure these changes are sync'ed */ spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); + /* Stop initializing */ + (void) vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED); + *txg = spa_vdev_config_enter(spa); sysevent_t *ev = spa_event_create(spa, vd, NULL, @@ -1819,6 +1824,13 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) */ error = spa_reset_logs(spa); + /* + * We stop any initializing that is currently in progress but leave + * the state as "active". This will allow the initializing to resume + * if the removal is canceled sometime later. + */ + vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE); + *txg = spa_vdev_config_enter(spa); /* @@ -1830,6 +1842,7 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) if (error != 0) { metaslab_group_activate(mg); + spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); return (error); } diff --git a/usr/src/uts/common/fs/zfs/vdev_root.c b/usr/src/uts/common/fs/zfs/vdev_root.c index b3433c2424..edb52d6ca7 100644 --- a/usr/src/uts/common/fs/zfs/vdev_root.c +++ b/usr/src/uts/common/fs/zfs/vdev_root.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. 
*/ #include @@ -149,6 +149,7 @@ vdev_ops_t vdev_root_ops = { NULL, NULL, NULL, + NULL, VDEV_TYPE_ROOT, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index 08170fedaa..10473579a5 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -189,6 +189,8 @@ #include #include #include +#include +#include #include "zfs_namecheck.h" #include "zfs_prop.h" @@ -3707,6 +3709,80 @@ zfs_ioc_destroy(zfs_cmd_t *zc) } /* + * innvl: { + * vdevs: { + * guid 1, guid 2, ... + * }, + * func: POOL_INITIALIZE_{CANCEL|DO|SUSPEND} + * } + * + * outnvl: { + * [func: EINVAL (if provided command type didn't make sense)], + * [vdevs: { + * guid1: errno, (see function body for possible errnos) + * ... + * }] + * } + * + */ +static int +zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + spa_t *spa; + int error; + + error = spa_open(poolname, &spa, FTAG); + if (error != 0) + return (error); + + uint64_t cmd_type; + if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND, + &cmd_type) != 0) { + spa_close(spa, FTAG); + return (SET_ERROR(EINVAL)); + } + if (!(cmd_type == POOL_INITIALIZE_CANCEL || + cmd_type == POOL_INITIALIZE_DO || + cmd_type == POOL_INITIALIZE_SUSPEND)) { + spa_close(spa, FTAG); + return (SET_ERROR(EINVAL)); + } + + nvlist_t *vdev_guids; + if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS, + &vdev_guids) != 0) { + spa_close(spa, FTAG); + return (SET_ERROR(EINVAL)); + } + + nvlist_t *vdev_errlist = fnvlist_alloc(); + int total_errors = 0; + + for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL); + pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) { + uint64_t vdev_guid = fnvpair_value_uint64(pair); + + error = spa_vdev_initialize(spa, vdev_guid, cmd_type); + if (error != 0) { + char guid_as_str[MAXNAMELEN]; + + (void) snprintf(guid_as_str, sizeof (guid_as_str), + "%llu", (unsigned 
long long)vdev_guid); + fnvlist_add_int64(vdev_errlist, guid_as_str, error); + total_errors++; + } + } + if (fnvlist_size(vdev_errlist) > 0) { + fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS, + vdev_errlist); + } + fnvlist_free(vdev_errlist); + + spa_close(spa, FTAG); + return (total_errors > 0 ? EINVAL : 0); +} + +/* * fsname is name of dataset to rollback (to most recent snapshot) * * innvl may contain name of expected target snapshot @@ -5869,6 +5945,10 @@ zfs_ioctl_init(void) zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE, + zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h index 790514ddef..4325a502fe 100644 --- a/usr/src/uts/common/sys/fs/zfs.h +++ b/usr/src/uts/common/sys/fs/zfs.h @@ -626,6 +626,13 @@ typedef struct zpool_load_policy { #define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \ "com.delphix:pool_checkpoint_sm" +#define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \ + "com.delphix:next_offset_to_initialize" +#define VDEV_LEAF_ZAP_INITIALIZE_STATE \ + "com.delphix:vdev_initialize_state" +#define VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME \ + "com.delphix:vdev_initialize_action_time" + /* * This is needed in userland to report the minimum necessary device size. * @@ -723,6 +730,15 @@ typedef enum pool_scrub_cmd { POOL_SCRUB_FLAGS_END } pool_scrub_cmd_t; +/* + * Initialize functions. + */ +typedef enum pool_initialize_func { + POOL_INITIALIZE_DO, + POOL_INITIALIZE_CANCEL, + POOL_INITIALIZE_SUSPEND, + POOL_INITIALIZE_FUNCS +} pool_initialize_func_t; /* * ZIO types. Needed to interpret vdev statistics below. 
@@ -796,6 +812,14 @@ typedef struct pool_checkpoint_stat { uint64_t pcs_space; /* checkpointed space */ } pool_checkpoint_stat_t; +typedef enum { + VDEV_INITIALIZE_NONE, + VDEV_INITIALIZE_ACTIVE, + VDEV_INITIALIZE_CANCELED, + VDEV_INITIALIZE_SUSPENDED, + VDEV_INITIALIZE_COMPLETE +} vdev_initializing_state_t; + /* * Vdev statistics. Note: all fields should be 64-bit because this * is passed between kernel and userland as an nvlist uint64 array. @@ -814,10 +838,15 @@ typedef struct vdev_stat { uint64_t vs_read_errors; /* read errors */ uint64_t vs_write_errors; /* write errors */ uint64_t vs_checksum_errors; /* checksum errors */ + uint64_t vs_initialize_errors; /* initializing errors */ uint64_t vs_self_healed; /* self-healed bytes */ uint64_t vs_scan_removing; /* removing? */ uint64_t vs_scan_processed; /* scan processed bytes */ uint64_t vs_fragmentation; /* device fragmentation */ + uint64_t vs_initialize_bytes_done; /* bytes initialized */ + uint64_t vs_initialize_bytes_est; /* total bytes to initialize */ + uint64_t vs_initialize_state; /* vdev_initializing_state_t */ + uint64_t vs_initialize_action_time; /* time_t */ uint64_t vs_checkpoint_space; /* checkpoint-consumed space */ } vdev_stat_t; @@ -945,6 +974,7 @@ typedef enum zfs_ioc { ZFS_IOC_REMAP, ZFS_IOC_POOL_CHECKPOINT, ZFS_IOC_POOL_DISCARD_CHECKPOINT, + ZFS_IOC_POOL_INITIALIZE, ZFS_IOC_LAST } zfs_ioc_t; @@ -1008,6 +1038,12 @@ typedef enum { #define ZPOOL_HIST_ERRNO "errno" /* + * The following are names used when invoking ZFS_IOC_POOL_INITIALIZE. + */ +#define ZPOOL_INITIALIZE_COMMAND "initialize_command" +#define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs" + +/* * Flags for ZFS_IOC_VDEV_SET_STATE */ #define ZFS_ONLINE_CHECKREMOVE 0x1 -- 2.11.4.GIT