usr/src/lib/libzfs_core/common/libzfs_core.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  24  * Copyright (c) 2013 Steven Hartland. All rights reserved.
  25  * Copyright (c) 2014 Integros [integros.com]
  26  * Copyright 2017 RackTop Systems.
  27  */
  28
  29 /*
  30  * LibZFS_Core (lzc) is intended to replace most functionality in libzfs.
  31  * It has the following characteristics:
  32  *
  33  *  - Thread Safe.  libzfs_core is accessible concurrently from multiple
  34  *  threads.  This is accomplished primarily by avoiding global data
  35  *  (e.g. caching).  Since it's thread-safe, there is no reason for a
  36  *  process to have multiple libzfs "instances".  Therefore, we store
  37  *  our few pieces of data (e.g. the file descriptor) in global
  38  *  variables.  The fd is reference-counted so that the libzfs_core
  39  *  library can be "initialized" multiple times (e.g. by different
  40  *  consumers within the same process).
  41  *
  42  *  - Committed Interface.  The libzfs_core interface will be committed,
  43  *  therefore consumers can compile against it and be confident that
  44  *  their code will continue to work on future releases of this code.
  45  *  Currently, the interface is Evolving (not Committed), but we intend
  46  *  to commit to it once it is more complete and we determine that it
  47  *  meets the needs of all consumers.
  48  *
   49  *  - Programmatic Error Handling.  libzfs_core communicates errors with
  50  *  defined error numbers, and doesn't print anything to stdout/stderr.
  51  *
  52  *  - Thin Layer.  libzfs_core is a thin layer, marshaling arguments
  53  *  to/from the kernel ioctls.  There is generally a 1:1 correspondence
  54  *  between libzfs_core functions and ioctls to /dev/zfs.
  55  *
  56  *  - Clear Atomicity.  Because libzfs_core functions are generally 1:1
   57  *  with kernel ioctls, and kernel ioctls are generally atomic, each
  58  *  libzfs_core function is atomic.  For example, creating multiple
  59  *  snapshots with a single call to lzc_snapshot() is atomic -- it
  60  *  can't fail with only some of the requested snapshots created, even
  61  *  in the event of power loss or system crash.
  62  *
  63  *  - Continued libzfs Support.  Some higher-level operations (e.g.
  64  *  support for "zfs send -R") are too complicated to fit the scope of
  65  *  libzfs_core.  This functionality will continue to live in libzfs.
  66  *  Where appropriate, libzfs will use the underlying atomic operations
  67  *  of libzfs_core.  For example, libzfs may implement "zfs send -R |
  68  *  zfs receive" by using individual "send one snapshot", rename,
  69  *  destroy, and "receive one snapshot" operations in libzfs_core.
   70  *  /sbin/zfs and /sbin/zpool will link with both libzfs and
  71  *  libzfs_core.  Other consumers should aim to use only libzfs_core,
  72  *  since that will be the supported, stable interface going forwards.
  73  */
  74
  75 #include <libzfs_core.h>
  76 #include <ctype.h>
  77 #include <unistd.h>
  78 #include <stdlib.h>
  79 #include <string.h>
  80 #include <errno.h>
  81 #include <fcntl.h>
  82 #include <pthread.h>
  83 #include <sys/nvpair.h>
  84 #include <sys/param.h>
  85 #include <sys/types.h>
  86 #include <sys/stat.h>
  87 #include <sys/zfs_ioctl.h>
  88
  89 static int g_fd = -1;
  90 static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER;
  91 static int g_refcount;
  92
  93 int
  94 libzfs_core_init(void)
  95 {
  96         (void) pthread_mutex_lock(&g_lock);
  97         if (g_refcount == 0) {
  98                 g_fd = open("/dev/zfs", O_RDWR);
  99                 if (g_fd < 0) {
 100                         (void) pthread_mutex_unlock(&g_lock);
 101                         return (errno);
 102                 }
 103         }
 104         g_refcount++;
 105         (void) pthread_mutex_unlock(&g_lock);
 106         return (0);
 107 }
 108
 109 void
 110 libzfs_core_fini(void)
 111 {
 112         (void) pthread_mutex_lock(&g_lock);
 113         ASSERT3S(g_refcount, >, 0);
 114
 115         if (g_refcount > 0)
 116                 g_refcount--;
 117
 118         if (g_refcount == 0 && g_fd != -1) {
 119                 (void) close(g_fd);
 120                 g_fd = -1;
 121         }
 122         (void) pthread_mutex_unlock(&g_lock);
 123 }
 124
 125 static int
 126 lzc_ioctl(zfs_ioc_t ioc, const char *name,
 127     nvlist_t *source, nvlist_t **resultp)
 128 {
 129         zfs_cmd_t zc = { 0 };
 130         int error = 0;
 131         char *packed;
 132         size_t size;
 133
 134         ASSERT3S(g_refcount, >, 0);
 135         VERIFY3S(g_fd, !=, -1);
 136
 137         (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
 138
 139         packed = fnvlist_pack(source, &size);
 140         zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed;
 141         zc.zc_nvlist_src_size = size;
 142
 143         if (resultp != NULL) {
 144                 *resultp = NULL;
 145                 if (ioc == ZFS_IOC_CHANNEL_PROGRAM) {
 146                         zc.zc_nvlist_dst_size = fnvlist_lookup_uint64(source,
 147                             ZCP_ARG_MEMLIMIT);
 148                 } else {
 149                         zc.zc_nvlist_dst_size = MAX(size * 2, 128 * 1024);
 150                 }
 151                 zc.zc_nvlist_dst = (uint64_t)(uintptr_t)
 152                     malloc(zc.zc_nvlist_dst_size);
 153                 if (zc.zc_nvlist_dst == (uintptr_t)NULL) {
 154                         error = ENOMEM;
 155                         goto out;
 156                 }
 157         }
 158
 159         while (ioctl(g_fd, ioc, &zc) != 0) {
 160                 /*
 161                  * If ioctl exited with ENOMEM, we retry the ioctl after
 162                  * increasing the size of the destination nvlist.
 163                  *
 164                  * Channel programs that exit with ENOMEM ran over the
 165                  * lua memory sandbox; they should not be retried.
 166                  */
 167                 if (errno == ENOMEM && resultp != NULL &&
 168                     ioc != ZFS_IOC_CHANNEL_PROGRAM) {
 169                         free((void *)(uintptr_t)zc.zc_nvlist_dst);
 170                         zc.zc_nvlist_dst_size *= 2;
 171                         zc.zc_nvlist_dst = (uint64_t)(uintptr_t)
 172                             malloc(zc.zc_nvlist_dst_size);
 173                         if (zc.zc_nvlist_dst == (uintptr_t)NULL) {
 174                                 error = ENOMEM;
 175                                 goto out;
 176                         }
 177                 } else {
 178                         error = errno;
 179                         break;
 180                 }
 181         }
 182         if (zc.zc_nvlist_dst_filled) {
 183                 *resultp = fnvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst,
 184                     zc.zc_nvlist_dst_size);
 185         }
 186
 187 out:
 188         fnvlist_pack_free(packed, size);
 189         free((void *)(uintptr_t)zc.zc_nvlist_dst);
 190         return (error);
 191 }
 192
 193 int
 194 lzc_create(const char *fsname, enum lzc_dataset_type type, nvlist_t *props)
 195 {
 196         int error;
 197         nvlist_t *args = fnvlist_alloc();
 198         fnvlist_add_int32(args, "type", (dmu_objset_type_t)type);
 199         if (props != NULL)
 200                 fnvlist_add_nvlist(args, "props", props);
 201         error = lzc_ioctl(ZFS_IOC_CREATE, fsname, args, NULL);
 202         nvlist_free(args);
 203         return (error);
 204 }
 205
 206 int
 207 lzc_clone(const char *fsname, const char *origin,
 208     nvlist_t *props)
 209 {
 210         int error;
 211         nvlist_t *args = fnvlist_alloc();
 212         fnvlist_add_string(args, "origin", origin);
 213         if (props != NULL)
 214                 fnvlist_add_nvlist(args, "props", props);
 215         error = lzc_ioctl(ZFS_IOC_CLONE, fsname, args, NULL);
 216         nvlist_free(args);
 217         return (error);
 218 }
 219
 220 int
 221 lzc_promote(const char *fsname, char *snapnamebuf, int snapnamelen)
 222 {
 223         /*
 224          * The promote ioctl is still legacy, so we need to construct our
 225          * own zfs_cmd_t rather than using lzc_ioctl().
 226          */
 227         zfs_cmd_t zc = { 0 };
 228
 229         ASSERT3S(g_refcount, >, 0);
 230         VERIFY3S(g_fd, !=, -1);
 231
 232         (void) strlcpy(zc.zc_name, fsname, sizeof (zc.zc_name));
 233         if (ioctl(g_fd, ZFS_IOC_PROMOTE, &zc) != 0) {
 234                 int error = errno;
 235                 if (error == EEXIST && snapnamebuf != NULL)
 236                         (void) strlcpy(snapnamebuf, zc.zc_string, snapnamelen);
 237                 return (error);
 238         }
 239         return (0);
 240 }
 241
 242 int
 243 lzc_remap(const char *fsname)
 244 {
 245         int error;
 246         nvlist_t *args = fnvlist_alloc();
 247         error = lzc_ioctl(ZFS_IOC_REMAP, fsname, args, NULL);
 248         nvlist_free(args);
 249         return (error);
 250 }
 251
 252 /*
 253  * Creates snapshots.
 254  *
 255  * The keys in the snaps nvlist are the snapshots to be created.
 256  * They must all be in the same pool.
 257  *
 258  * The props nvlist is properties to set.  Currently only user properties
 259  * are supported.  { user:prop_name -> string value }
 260  *
 261  * The returned results nvlist will have an entry for each snapshot that failed.
 262  * The value will be the (int32) error code.
 263  *
 264  * The return value will be 0 if all snapshots were created, otherwise it will
  265  * be the errno of an (unspecified) snapshot that failed.
 266  */
 267 int
 268 lzc_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t **errlist)
 269 {
 270         nvpair_t *elem;
 271         nvlist_t *args;
 272         int error;
 273         char pool[ZFS_MAX_DATASET_NAME_LEN];
 274
 275         *errlist = NULL;
 276
 277         /* determine the pool name */
 278         elem = nvlist_next_nvpair(snaps, NULL);
 279         if (elem == NULL)
 280                 return (0);
 281         (void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
 282         pool[strcspn(pool, "/@")] = '\0';
 283
 284         args = fnvlist_alloc();
 285         fnvlist_add_nvlist(args, "snaps", snaps);
 286         if (props != NULL)
 287                 fnvlist_add_nvlist(args, "props", props);
 288
 289         error = lzc_ioctl(ZFS_IOC_SNAPSHOT, pool, args, errlist);
 290         nvlist_free(args);
 291
 292         return (error);
 293 }
 294
 295 /*
 296  * Destroys snapshots.
 297  *
 298  * The keys in the snaps nvlist are the snapshots to be destroyed.
 299  * They must all be in the same pool.
 300  *
 301  * Snapshots that do not exist will be silently ignored.
 302  *
 303  * If 'defer' is not set, and a snapshot has user holds or clones, the
 304  * destroy operation will fail and none of the snapshots will be
 305  * destroyed.
 306  *
 307  * If 'defer' is set, and a snapshot has user holds or clones, it will be
 308  * marked for deferred destruction, and will be destroyed when the last hold
 309  * or clone is removed/destroyed.
 310  *
 311  * The return value will be 0 if all snapshots were destroyed (or marked for
 312  * later destruction if 'defer' is set) or didn't exist to begin with.
 313  *
  314  * Otherwise the return value will be the errno of an (unspecified) snapshot
 315  * that failed, no snapshots will be destroyed, and the errlist will have an
 316  * entry for each snapshot that failed.  The value in the errlist will be
 317  * the (int32) error code.
 318  */
 319 int
 320 lzc_destroy_snaps(nvlist_t *snaps, boolean_t defer, nvlist_t **errlist)
 321 {
 322         nvpair_t *elem;
 323         nvlist_t *args;
 324         int error;
 325         char pool[ZFS_MAX_DATASET_NAME_LEN];
 326
 327         /* determine the pool name */
 328         elem = nvlist_next_nvpair(snaps, NULL);
 329         if (elem == NULL)
 330                 return (0);
 331         (void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
 332         pool[strcspn(pool, "/@")] = '\0';
 333
 334         args = fnvlist_alloc();
 335         fnvlist_add_nvlist(args, "snaps", snaps);
 336         if (defer)
 337                 fnvlist_add_boolean(args, "defer");
 338
 339         error = lzc_ioctl(ZFS_IOC_DESTROY_SNAPS, pool, args, errlist);
 340         nvlist_free(args);
 341
 342         return (error);
 343 }
 344
 345 int
 346 lzc_snaprange_space(const char *firstsnap, const char *lastsnap,
 347     uint64_t *usedp)
 348 {
 349         nvlist_t *args;
 350         nvlist_t *result;
 351         int err;
 352         char fs[ZFS_MAX_DATASET_NAME_LEN];
 353         char *atp;
 354
 355         /* determine the fs name */
 356         (void) strlcpy(fs, firstsnap, sizeof (fs));
 357         atp = strchr(fs, '@');
 358         if (atp == NULL)
 359                 return (EINVAL);
 360         *atp = '\0';
 361
 362         args = fnvlist_alloc();
 363         fnvlist_add_string(args, "firstsnap", firstsnap);
 364
 365         err = lzc_ioctl(ZFS_IOC_SPACE_SNAPS, lastsnap, args, &result);
 366         nvlist_free(args);
 367         if (err == 0)
 368                 *usedp = fnvlist_lookup_uint64(result, "used");
 369         fnvlist_free(result);
 370
 371         return (err);
 372 }
 373
 374 boolean_t
 375 lzc_exists(const char *dataset)
 376 {
 377         /*
 378          * The objset_stats ioctl is still legacy, so we need to construct our
 379          * own zfs_cmd_t rather than using lzc_ioctl().
 380          */
 381         zfs_cmd_t zc = { 0 };
 382
 383         ASSERT3S(g_refcount, >, 0);
 384         VERIFY3S(g_fd, !=, -1);
 385
 386         (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
 387         return (ioctl(g_fd, ZFS_IOC_OBJSET_STATS, &zc) == 0);
 388 }
 389
 390 /*
 391  * Create "user holds" on snapshots.  If there is a hold on a snapshot,
 392  * the snapshot can not be destroyed.  (However, it can be marked for deletion
 393  * by lzc_destroy_snaps(defer=B_TRUE).)
 394  *
 395  * The keys in the nvlist are snapshot names.
 396  * The snapshots must all be in the same pool.
 397  * The value is the name of the hold (string type).
 398  *
 399  * If cleanup_fd is not -1, it must be the result of open("/dev/zfs", O_EXCL).
 400  * In this case, when the cleanup_fd is closed (including on process
 401  * termination), the holds will be released.  If the system is shut down
 402  * uncleanly, the holds will be released when the pool is next opened
 403  * or imported.
 404  *
 405  * Holds for snapshots which don't exist will be skipped and have an entry
 406  * added to errlist, but will not cause an overall failure.
 407  *
 408  * The return value will be 0 if all holds, for snapshots that existed,
  409  * were successfully created.
 410  *
 411  * Otherwise the return value will be the errno of a (unspecified) hold that
 412  * failed and no holds will be created.
 413  *
 414  * In all cases the errlist will have an entry for each hold that failed
 415  * (name = snapshot), with its value being the error code (int32).
 416  */
 417 int
 418 lzc_hold(nvlist_t *holds, int cleanup_fd, nvlist_t **errlist)
 419 {
 420         char pool[ZFS_MAX_DATASET_NAME_LEN];
 421         nvlist_t *args;
 422         nvpair_t *elem;
 423         int error;
 424
 425         /* determine the pool name */
 426         elem = nvlist_next_nvpair(holds, NULL);
 427         if (elem == NULL)
 428                 return (0);
 429         (void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
 430         pool[strcspn(pool, "/@")] = '\0';
 431
 432         args = fnvlist_alloc();
 433         fnvlist_add_nvlist(args, "holds", holds);
 434         if (cleanup_fd != -1)
 435                 fnvlist_add_int32(args, "cleanup_fd", cleanup_fd);
 436
 437         error = lzc_ioctl(ZFS_IOC_HOLD, pool, args, errlist);
 438         nvlist_free(args);
 439         return (error);
 440 }
 441
 442 /*
 443  * Release "user holds" on snapshots.  If the snapshot has been marked for
 444  * deferred destroy (by lzc_destroy_snaps(defer=B_TRUE)), it does not have
 445  * any clones, and all the user holds are removed, then the snapshot will be
 446  * destroyed.
 447  *
 448  * The keys in the nvlist are snapshot names.
 449  * The snapshots must all be in the same pool.
 450  * The value is a nvlist whose keys are the holds to remove.
 451  *
 452  * Holds which failed to release because they didn't exist will have an entry
 453  * added to errlist, but will not cause an overall failure.
 454  *
  455  * The return value will be 0 if the holds nvlist was empty or all holds that
  456  * existed were successfully removed.
 457  *
 458  * Otherwise the return value will be the errno of a (unspecified) hold that
 459  * failed to release and no holds will be released.
 460  *
  461  * In all cases the errlist will have an entry for each hold that failed to
  462  * release.
 463  */
 464 int
 465 lzc_release(nvlist_t *holds, nvlist_t **errlist)
 466 {
 467         char pool[ZFS_MAX_DATASET_NAME_LEN];
 468         nvpair_t *elem;
 469
 470         /* determine the pool name */
 471         elem = nvlist_next_nvpair(holds, NULL);
 472         if (elem == NULL)
 473                 return (0);
 474         (void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
 475         pool[strcspn(pool, "/@")] = '\0';
 476
 477         return (lzc_ioctl(ZFS_IOC_RELEASE, pool, holds, errlist));
 478 }
 479
 480 /*
 481  * Retrieve list of user holds on the specified snapshot.
 482  *
 483  * On success, *holdsp will be set to a nvlist which the caller must free.
 484  * The keys are the names of the holds, and the value is the creation time
 485  * of the hold (uint64) in seconds since the epoch.
 486  */
 487 int
 488 lzc_get_holds(const char *snapname, nvlist_t **holdsp)
 489 {
 490         int error;
 491         nvlist_t *innvl = fnvlist_alloc();
 492         error = lzc_ioctl(ZFS_IOC_GET_HOLDS, snapname, innvl, holdsp);
 493         fnvlist_free(innvl);
 494         return (error);
 495 }
 496
 497 /*
 498  * Generate a zfs send stream for the specified snapshot and write it to
 499  * the specified file descriptor.
 500  *
 501  * "snapname" is the full name of the snapshot to send (e.g. "pool/fs@snap")
 502  *
 503  * If "from" is NULL, a full (non-incremental) stream will be sent.
 504  * If "from" is non-NULL, it must be the full name of a snapshot or
 505  * bookmark to send an incremental from (e.g. "pool/fs@earlier_snap" or
 506  * "pool/fs#earlier_bmark").  If non-NULL, the specified snapshot or
  507  * bookmark must represent an earlier point in the history of "snapname".
 508  * It can be an earlier snapshot in the same filesystem or zvol as "snapname",
 509  * or it can be the origin of "snapname"'s filesystem, or an earlier
 510  * snapshot in the origin, etc.
 511  *
 512  * "fd" is the file descriptor to write the send stream to.
 513  *
 514  * If "flags" contains LZC_SEND_FLAG_LARGE_BLOCK, the stream is permitted
 515  * to contain DRR_WRITE records with drr_length > 128K, and DRR_OBJECT
 516  * records with drr_blksz > 128K.
 517  *
 518  * If "flags" contains LZC_SEND_FLAG_EMBED_DATA, the stream is permitted
 519  * to contain DRR_WRITE_EMBEDDED records with drr_etype==BP_EMBEDDED_TYPE_DATA,
 520  * which the receiving system must support (as indicated by support
 521  * for the "embedded_data" feature).
 522  */
 523 int
 524 lzc_send(const char *snapname, const char *from, int fd,
 525     enum lzc_send_flags flags)
 526 {
 527         return (lzc_send_resume(snapname, from, fd, flags, 0, 0));
 528 }
 529
 530 int
 531 lzc_send_resume(const char *snapname, const char *from, int fd,
 532     enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff)
 533 {
 534         nvlist_t *args;
 535         int err;
 536
 537         args = fnvlist_alloc();
 538         fnvlist_add_int32(args, "fd", fd);
 539         if (from != NULL)
 540                 fnvlist_add_string(args, "fromsnap", from);
 541         if (flags & LZC_SEND_FLAG_LARGE_BLOCK)
 542                 fnvlist_add_boolean(args, "largeblockok");
 543         if (flags & LZC_SEND_FLAG_EMBED_DATA)
 544                 fnvlist_add_boolean(args, "embedok");
 545         if (flags & LZC_SEND_FLAG_COMPRESS)
 546                 fnvlist_add_boolean(args, "compressok");
 547         if (resumeobj != 0 || resumeoff != 0) {
 548                 fnvlist_add_uint64(args, "resume_object", resumeobj);
 549                 fnvlist_add_uint64(args, "resume_offset", resumeoff);
 550         }
 551         err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL);
 552         nvlist_free(args);
 553         return (err);
 554 }
 555
 556 /*
 557  * "from" can be NULL, a snapshot, or a bookmark.
 558  *
 559  * If from is NULL, a full (non-incremental) stream will be estimated.  This
 560  * is calculated very efficiently.
 561  *
 562  * If from is a snapshot, lzc_send_space uses the deadlists attached to
 563  * each snapshot to efficiently estimate the stream size.
 564  *
 565  * If from is a bookmark, the indirect blocks in the destination snapshot
 566  * are traversed, looking for blocks with a birth time since the creation TXG of
 567  * the snapshot this bookmark was created from.  This will result in
 568  * significantly more I/O and be less efficient than a send space estimation on
 569  * an equivalent snapshot.
 570  */
 571 int
 572 lzc_send_space(const char *snapname, const char *from,
 573     enum lzc_send_flags flags, uint64_t *spacep)
 574 {
 575         nvlist_t *args;
 576         nvlist_t *result;
 577         int err;
 578
 579         args = fnvlist_alloc();
 580         if (from != NULL)
 581                 fnvlist_add_string(args, "from", from);
 582         if (flags & LZC_SEND_FLAG_LARGE_BLOCK)
 583                 fnvlist_add_boolean(args, "largeblockok");
 584         if (flags & LZC_SEND_FLAG_EMBED_DATA)
 585                 fnvlist_add_boolean(args, "embedok");
 586         if (flags & LZC_SEND_FLAG_COMPRESS)
 587                 fnvlist_add_boolean(args, "compressok");
 588         err = lzc_ioctl(ZFS_IOC_SEND_SPACE, snapname, args, &result);
 589         nvlist_free(args);
 590         if (err == 0)
 591                 *spacep = fnvlist_lookup_uint64(result, "space");
 592         nvlist_free(result);
 593         return (err);
 594 }
 595
 596 static int
 597 recv_read(int fd, void *buf, int ilen)
 598 {
 599         char *cp = buf;
 600         int rv;
 601         int len = ilen;
 602
 603         do {
 604                 rv = read(fd, cp, len);
 605                 cp += rv;
 606                 len -= rv;
 607         } while (rv > 0);
 608
 609         if (rv < 0 || len != 0)
 610                 return (EIO);
 611
 612         return (0);
 613 }
 614
 615 static int
 616 recv_impl(const char *snapname, nvlist_t *props, const char *origin,
 617     boolean_t force, boolean_t resumable, int fd,
 618     const dmu_replay_record_t *begin_record)
 619 {
 620         /*
 621          * The receive ioctl is still legacy, so we need to construct our own
 622          * zfs_cmd_t rather than using zfsc_ioctl().
 623          */
 624         zfs_cmd_t zc = { 0 };
 625         char *atp;
 626         char *packed = NULL;
 627         size_t size;
 628         int error;
 629
 630         ASSERT3S(g_refcount, >, 0);
 631         VERIFY3S(g_fd, !=, -1);
 632
 633         /* zc_name is name of containing filesystem */
 634         (void) strlcpy(zc.zc_name, snapname, sizeof (zc.zc_name));
 635         atp = strchr(zc.zc_name, '@');
 636         if (atp == NULL)
 637                 return (EINVAL);
 638         *atp = '\0';
 639
 640         /* if the fs does not exist, try its parent. */
 641         if (!lzc_exists(zc.zc_name)) {
 642                 char *slashp = strrchr(zc.zc_name, '/');
 643                 if (slashp == NULL)
 644                         return (ENOENT);
 645                 *slashp = '\0';
 646
 647         }
 648
 649         /* zc_value is full name of the snapshot to create */
 650         (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value));
 651
 652         if (props != NULL) {
 653                 /* zc_nvlist_src is props to set */
 654                 packed = fnvlist_pack(props, &size);
 655                 zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed;
 656                 zc.zc_nvlist_src_size = size;
 657         }
 658
 659         /* zc_string is name of clone origin (if DRR_FLAG_CLONE) */
 660         if (origin != NULL)
 661                 (void) strlcpy(zc.zc_string, origin, sizeof (zc.zc_string));
 662
 663         /* zc_begin_record is non-byteswapped BEGIN record */
 664         if (begin_record == NULL) {
 665                 error = recv_read(fd, &zc.zc_begin_record,
 666                     sizeof (zc.zc_begin_record));
 667                 if (error != 0)
 668                         goto out;
 669         } else {
 670                 zc.zc_begin_record = *begin_record;
 671         }
 672
 673         /* zc_cookie is fd to read from */
 674         zc.zc_cookie = fd;
 675
 676         /* zc guid is force flag */
 677         zc.zc_guid = force;
 678
 679         zc.zc_resumable = resumable;
 680
 681         /* zc_cleanup_fd is unused */
 682         zc.zc_cleanup_fd = -1;
 683
 684         error = ioctl(g_fd, ZFS_IOC_RECV, &zc);
 685         if (error != 0)
 686                 error = errno;
 687
 688 out:
 689         if (packed != NULL)
 690                 fnvlist_pack_free(packed, size);
 691         free((void*)(uintptr_t)zc.zc_nvlist_dst);
 692         return (error);
 693 }
 694
 695 /*
 696  * The simplest receive case: receive from the specified fd, creating the
 697  * specified snapshot.  Apply the specified properties as "received" properties
 698  * (which can be overridden by locally-set properties).  If the stream is a
 699  * clone, its origin snapshot must be specified by 'origin'.  The 'force'
 700  * flag will cause the target filesystem to be rolled back or destroyed if
 701  * necessary to receive.
 702  *
 703  * Return 0 on success or an errno on failure.
 704  *
 705  * Note: this interface does not work on dedup'd streams
 706  * (those with DMU_BACKUP_FEATURE_DEDUP).
 707  */
 708 int
 709 lzc_receive(const char *snapname, nvlist_t *props, const char *origin,
 710     boolean_t force, int fd)
 711 {
 712         return (recv_impl(snapname, props, origin, force, B_FALSE, fd, NULL));
 713 }
 714
 715 /*
 716  * Like lzc_receive, but if the receive fails due to premature stream
 717  * termination, the intermediate state will be preserved on disk.  In this
 718  * case, ECKSUM will be returned.  The receive may subsequently be resumed
 719  * with a resuming send stream generated by lzc_send_resume().
 720  */
 721 int
 722 lzc_receive_resumable(const char *snapname, nvlist_t *props, const char *origin,
 723     boolean_t force, int fd)
 724 {
 725         return (recv_impl(snapname, props, origin, force, B_TRUE, fd, NULL));
 726 }
 727
 728 /*
 729  * Like lzc_receive, but allows the caller to read the begin record and then to
 730  * pass it in.  That could be useful if the caller wants to derive, for example,
 731  * the snapname or the origin parameters based on the information contained in
 732  * the begin record.
 733  * The begin record must be in its original form as read from the stream,
 734  * in other words, it should not be byteswapped.
 735  *
  736  * The 'resumable' parameter selects the same behavior as
  737  * lzc_receive_resumable.
 738  */
 739 int
 740 lzc_receive_with_header(const char *snapname, nvlist_t *props,
 741     const char *origin, boolean_t force, boolean_t resumable, int fd,
 742     const dmu_replay_record_t *begin_record)
 743 {
 744         if (begin_record == NULL)
 745                 return (EINVAL);
 746         return (recv_impl(snapname, props, origin, force, resumable, fd,
 747             begin_record));
 748 }
 749
 750 /*
 751  * Roll back this filesystem or volume to its most recent snapshot.
 752  * If snapnamebuf is not NULL, it will be filled in with the name
 753  * of the most recent snapshot.
 754  * Note that the latest snapshot may change if a new one is concurrently
 755  * created or the current one is destroyed.  lzc_rollback_to can be used
 756  * to roll back to a specific latest snapshot.
 757  *
 758  * Return 0 on success or an errno on failure.
 759  */
 760 int
 761 lzc_rollback(const char *fsname, char *snapnamebuf, int snapnamelen)
 762 {
 763         nvlist_t *args;
 764         nvlist_t *result;
 765         int err;
 766
 767         args = fnvlist_alloc();
 768         err = lzc_ioctl(ZFS_IOC_ROLLBACK, fsname, args, &result);
 769         nvlist_free(args);
 770         if (err == 0 && snapnamebuf != NULL) {
 771                 const char *snapname = fnvlist_lookup_string(result, "target");
 772                 (void) strlcpy(snapnamebuf, snapname, snapnamelen);
 773         }
 774         nvlist_free(result);
 775
 776         return (err);
 777 }
 778
 779 /*
 780  * Roll back this filesystem or volume to the specified snapshot,
 781  * if possible.
 782  *
 783  * Return 0 on success or an errno on failure.
 784  */
 785 int
 786 lzc_rollback_to(const char *fsname, const char *snapname)
 787 {
 788         nvlist_t *args;
 789         nvlist_t *result;
 790         int err;
 791
 792         args = fnvlist_alloc();
 793         fnvlist_add_string(args, "target", snapname);
 794         err = lzc_ioctl(ZFS_IOC_ROLLBACK, fsname, args, &result);
 795         nvlist_free(args);
 796         nvlist_free(result);
 797         return (err);
 798 }
 799
 800 /*
 801  * Creates bookmarks.
 802  *
 803  * The bookmarks nvlist maps from name of the bookmark (e.g. "pool/fs#bmark") to
 804  * the name of the snapshot (e.g. "pool/fs@snap").  All the bookmarks and
 805  * snapshots must be in the same pool.
 806  *
 807  * The returned results nvlist will have an entry for each bookmark that failed.
 808  * The value will be the (int32) error code.
 809  *
 810  * The return value will be 0 if all bookmarks were created, otherwise it will
  811  * be the errno of an (undetermined) bookmark that failed.
 812  */
 813 int
 814 lzc_bookmark(nvlist_t *bookmarks, nvlist_t **errlist)
 815 {
 816         nvpair_t *elem;
 817         int error;
 818         char pool[ZFS_MAX_DATASET_NAME_LEN];
 819
 820         /* determine the pool name */
 821         elem = nvlist_next_nvpair(bookmarks, NULL);
 822         if (elem == NULL)
 823                 return (0);
 824         (void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
 825         pool[strcspn(pool, "/#")] = '\0';
 826
 827         error = lzc_ioctl(ZFS_IOC_BOOKMARK, pool, bookmarks, errlist);
 828
 829         return (error);
 830 }
 831
 832 /*
 833  * Retrieve bookmarks.
 834  *
 835  * Retrieve the list of bookmarks for the given file system. The props
 836  * parameter is an nvlist of property names (with no values) that will be
 837  * returned for each bookmark.
 838  *
 839  * The following are valid properties on bookmarks, all of which are numbers
 840  * (represented as uint64 in the nvlist)
 841  *
 842  * "guid" - globally unique identifier of the snapshot it refers to
 843  * "createtxg" - txg when the snapshot it refers to was created
 844  * "creation" - timestamp when the snapshot it refers to was created
 845  *
 846  * The format of the returned nvlist as follows:
 847  * <short name of bookmark> -> {
 848  *     <name of property> -> {
 849  *         "value" -> uint64
 850  *     }
 851  *  }
 852  */
 853 int
 854 lzc_get_bookmarks(const char *fsname, nvlist_t *props, nvlist_t **bmarks)
 855 {
 856         return (lzc_ioctl(ZFS_IOC_GET_BOOKMARKS, fsname, props, bmarks));
 857 }
 858
 859 /*
 860  * Destroys bookmarks.
 861  *
 862  * The keys in the bmarks nvlist are the bookmarks to be destroyed.
 863  * They must all be in the same pool.  Bookmarks are specified as
 864  * <fs>#<bmark>.
 865  *
 866  * Bookmarks that do not exist will be silently ignored.
 867  *
 868  * The return value will be 0 if all bookmarks that existed were destroyed.
 869  *
 870  * Otherwise the return value will be the errno of a (undetermined) bookmark
 871  * that failed, no bookmarks will be destroyed, and the errlist will have an
 872  * entry for each bookmarks that failed.  The value in the errlist will be
 873  * the (int32) error code.
 874  */
 875 int
 876 lzc_destroy_bookmarks(nvlist_t *bmarks, nvlist_t **errlist)
 877 {
 878         nvpair_t *elem;
 879         int error;
 880         char pool[ZFS_MAX_DATASET_NAME_LEN];
 881
 882         /* determine the pool name */
 883         elem = nvlist_next_nvpair(bmarks, NULL);
 884         if (elem == NULL)
 885                 return (0);
 886         (void) strlcpy(pool, nvpair_name(elem), sizeof (pool));
 887         pool[strcspn(pool, "/#")] = '\0';
 888
 889         error = lzc_ioctl(ZFS_IOC_DESTROY_BOOKMARKS, pool, bmarks, errlist);
 890
 891         return (error);
 892 }
 893
 894 static int
 895 lzc_channel_program_impl(const char *pool, const char *program, boolean_t sync,
 896     uint64_t instrlimit, uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl)
 897 {
 898         int error;
 899         nvlist_t *args;
 900
 901         args = fnvlist_alloc();
 902         fnvlist_add_string(args, ZCP_ARG_PROGRAM, program);
 903         fnvlist_add_nvlist(args, ZCP_ARG_ARGLIST, argnvl);
 904         fnvlist_add_boolean_value(args, ZCP_ARG_SYNC, sync);
 905         fnvlist_add_uint64(args, ZCP_ARG_INSTRLIMIT, instrlimit);
 906         fnvlist_add_uint64(args, ZCP_ARG_MEMLIMIT, memlimit);
 907         error = lzc_ioctl(ZFS_IOC_CHANNEL_PROGRAM, pool, args, outnvl);
 908         fnvlist_free(args);
 909
 910         return (error);
 911 }
 912
 913 /*
 914  * Executes a channel program.
 915  *
 916  * If this function returns 0 the channel program was successfully loaded and
 917  * ran without failing. Note that individual commands the channel program ran
 918  * may have failed and the channel program is responsible for reporting such
 919  * errors through outnvl if they are important.
 920  *
 921  * This method may also return:
 922  *
 923  * EINVAL   The program contains syntax errors, or an invalid memory or time
 924  *          limit was given. No part of the channel program was executed.
 925  *          If caused by syntax errors, 'outnvl' contains information about the
 926  *          errors.
 927  *
 928  * ECHRNG   The program was executed, but encountered a runtime error, such as
 929  *          calling a function with incorrect arguments, invoking the error()
 930  *          function directly, failing an assert() command, etc. Some portion
 931  *          of the channel program may have executed and committed changes.
 932  *          Information about the failure can be found in 'outnvl'.
 933  *
 934  * ENOMEM   The program fully executed, but the output buffer was not large
 935  *          enough to store the returned value. No output is returned through
 936  *          'outnvl'.
 937  *
 938  * ENOSPC   The program was terminated because it exceeded its memory usage
 939  *          limit. Some portion of the channel program may have executed and
 940  *          committed changes to disk. No output is returned through 'outnvl'.
 941  *
 942  * ETIME    The program was terminated because it exceeded its Lua instruction
 943  *          limit. Some portion of the channel program may have executed and
 944  *          committed changes to disk. No output is returned through 'outnvl'.
 945  */
 946 int
 947 lzc_channel_program(const char *pool, const char *program, uint64_t instrlimit,
 948     uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl)
 949 {
 950         return (lzc_channel_program_impl(pool, program, B_TRUE, instrlimit,
 951             memlimit, argnvl, outnvl));
 952 }
 953
 954 /*
 955  * Creates a checkpoint for the specified pool.
 956  *
 957  * If this function returns 0 the pool was successfully checkpointed.
 958  *
 959  * This method may also return:
 960  *
 961  * ZFS_ERR_CHECKPOINT_EXISTS
 962  *      The pool already has a checkpoint. A pools can only have one
 963  *      checkpoint at most, at any given time.
 964  *
 965  * ZFS_ERR_DISCARDING_CHECKPOINT
 966  *      ZFS is in the middle of discarding a checkpoint for this pool.
 967  *      The pool can be checkpointed again once the discard is done.
 968  *
 969  * ZFS_DEVRM_IN_PROGRESS
 970  *      A vdev is currently being removed. The pool cannot be
 971  *      checkpointed until the device removal is done.
 972  *
 973  * ZFS_VDEV_TOO_BIG
 974  *      One or more top-level vdevs exceed the maximum vdev size
 975  *      supported for this feature.
 976  */
 977 int
 978 lzc_pool_checkpoint(const char *pool)
 979 {
 980         int error;
 981
 982         nvlist_t *result = NULL;
 983         nvlist_t *args = fnvlist_alloc();
 984
 985         error = lzc_ioctl(ZFS_IOC_POOL_CHECKPOINT, pool, args, &result);
 986
 987         fnvlist_free(args);
 988         fnvlist_free(result);
 989
 990         return (error);
 991 }
 992
 993 /*
 994  * Discard the checkpoint from the specified pool.
 995  *
 996  * If this function returns 0 the checkpoint was successfully discarded.
 997  *
 998  * This method may also return:
 999  *
1000  * ZFS_ERR_NO_CHECKPOINT
1001  *      The pool does not have a checkpoint.
1002  *
1003  * ZFS_ERR_DISCARDING_CHECKPOINT
1004  *      ZFS is already in the middle of discarding the checkpoint.
1005  */
1006 int
1007 lzc_pool_checkpoint_discard(const char *pool)
1008 {
1009         int error;
1010
1011         nvlist_t *result = NULL;
1012         nvlist_t *args = fnvlist_alloc();
1013
1014         error = lzc_ioctl(ZFS_IOC_POOL_DISCARD_CHECKPOINT, pool, args, &result);
1015
1016         fnvlist_free(args);
1017         fnvlist_free(result);
1018
1019         return (error);
1020 }
1021
1022 /*
1023  * Executes a read-only channel program.
1024  *
1025  * A read-only channel program works programmatically the same way as a
1026  * normal channel program executed with lzc_channel_program(). The only
1027  * difference is it runs exclusively in open-context and therefore can
1028  * return faster. The downside to that, is that the program cannot change
1029  * on-disk state by calling functions from the zfs.sync submodule.
1030  *
1031  * The return values of this function (and their meaning) are exactly the
1032  * same as the ones described in lzc_channel_program().
1033  */
1034 int
1035 lzc_channel_program_nosync(const char *pool, const char *program,
1036     uint64_t timeout, uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl)
1037 {
1038         return (lzc_channel_program_impl(pool, program, B_FALSE, timeout,
1039             memlimit, argnvl, outnvl));
1040 }
1041
1042 /*
1043  * Changes initializing state.
1044  *
1045  * vdevs should be a list of (<key>, guid) where guid is a uint64 vdev GUID.
1046  * The key is ignored.
1047  *
1048  * If there are errors related to vdev arguments, per-vdev errors are returned
1049  * in an nvlist with the key "vdevs". Each error is a (guid, errno) pair where
1050  * guid is stringified with PRIu64, and errno is one of the following as
1051  * an int64_t:
1052  *      - ENODEV if the device was not found
1053  *      - EINVAL if the devices is not a leaf or is not concrete (e.g. missing)
1054  *      - EROFS if the device is not writeable
1055  *      - EBUSY start requested but the device is already being initialized
1056  *      - ESRCH cancel/suspend requested but device is not being initialized
1057  *
1058  * If the errlist is empty, then return value will be:
1059  *      - EINVAL if one or more arguments was invalid
1060  *      - Other spa_open failures
1061  *      - 0 if the operation succeeded
1062  */
1063 int
1064 lzc_initialize(const char *poolname, pool_initialize_func_t cmd_type,
1065     nvlist_t *vdevs, nvlist_t **errlist)
1066 {
1067         int error;
1068         nvlist_t *args = fnvlist_alloc();
1069         fnvlist_add_uint64(args, ZPOOL_INITIALIZE_COMMAND, (uint64_t)cmd_type);
1070         fnvlist_add_nvlist(args, ZPOOL_INITIALIZE_VDEVS, vdevs);
1071
1072         error = lzc_ioctl(ZFS_IOC_POOL_INITIALIZE, poolname, args, errlist);
1073
1074         fnvlist_free(args);
1075
1076         return (error);
1077 }