6328 Fix cstyle errors in zfs codebase
[unleashed.git] / usr/src/grub/grub-0.97/stage2/fsys_zfs.c
1 /*
2 * GRUB -- GRand Unified Bootloader
3 * Copyright (C) 1999,2000,2001,2002,2003,2004 Free Software Foundation, Inc.
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
22 * Use is subject to license terms.
26 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
27 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
31 * The zfs plug-in routines for GRUB are:
33 * zfs_mount() - locates a valid uberblock of the root pool and reads
34 * in its MOS at the memory address MOS.
36 * zfs_open() - locates a plain file object by following the MOS
37 * and places its dnode at the memory address DNODE.
39 * zfs_read() - reads in the data blocks pointed to by the DNODE.
41 * ZFS_SCRATCH is used as a working area.
43 * (memory addr) MOS DNODE ZFS_SCRATCH
44 * | | |
45 * +-------V---------V----------V---------------+
46 * memory | | dnode | dnode | scratch |
47 * | | 512B | 512B | area |
48 * +--------------------------------------------+
51 #ifdef FSYS_ZFS
53 #include "shared.h"
54 #include "filesys.h"
55 #include "fsys_zfs.h"
57 /* cache for a file block of the currently zfs_open()-ed file */
58 static void *file_buf = NULL;
59 static uint64_t file_start = 0;
60 static uint64_t file_end = 0;
62 /* cache for a dnode block */
63 static dnode_phys_t *dnode_buf = NULL;
64 static dnode_phys_t *dnode_mdn = NULL;
65 static uint64_t dnode_start = 0;
66 static uint64_t dnode_end = 0;
68 static uint64_t pool_guid = 0;
69 static uberblock_t current_uberblock;
70 static char *stackbase;
72 decomp_entry_t decomp_table[ZIO_COMPRESS_FUNCTIONS] =
74 {"inherit", 0}, /* ZIO_COMPRESS_INHERIT */
75 {"on", lzjb_decompress}, /* ZIO_COMPRESS_ON */
76 {"off", 0}, /* ZIO_COMPRESS_OFF */
77 {"lzjb", lzjb_decompress}, /* ZIO_COMPRESS_LZJB */
78 {"empty", 0}, /* ZIO_COMPRESS_EMPTY */
79 {"gzip-1", 0}, /* ZIO_COMPRESS_GZIP_1 */
80 {"gzip-2", 0}, /* ZIO_COMPRESS_GZIP_2 */
81 {"gzip-3", 0}, /* ZIO_COMPRESS_GZIP_3 */
82 {"gzip-4", 0}, /* ZIO_COMPRESS_GZIP_4 */
83 {"gzip-5", 0}, /* ZIO_COMPRESS_GZIP_5 */
84 {"gzip-6", 0}, /* ZIO_COMPRESS_GZIP_6 */
85 {"gzip-7", 0}, /* ZIO_COMPRESS_GZIP_7 */
86 {"gzip-8", 0}, /* ZIO_COMPRESS_GZIP_8 */
87 {"gzip-9", 0}, /* ZIO_COMPRESS_GZIP_9 */
88 {"zle", 0}, /* ZIO_COMPRESS_ZLE */
89 {"lz4", lz4_decompress} /* ZIO_COMPRESS_LZ4 */
92 static int zio_read_data(blkptr_t *bp, void *buf, char *stack);
95 * Our own version of bcmp().
97 static int
98 zfs_bcmp(const void *s1, const void *s2, size_t n)
100 const uchar_t *ps1 = s1;
101 const uchar_t *ps2 = s2;
103 if (s1 != s2 && n != 0) {
104 do {
105 if (*ps1++ != *ps2++)
106 return (1);
107 } while (--n != 0);
110 return (0);
114 * Our own version of log2(). Same thing as highbit()-1.
116 static int
117 zfs_log2(uint64_t num)
119 int i = 0;
121 while (num > 1) {
122 i++;
123 num = num >> 1;
126 return (i);
129 /* Checksum Functions */
130 static void
131 zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
133 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
136 /* Checksum Table and Values */
137 zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
138 {{NULL, NULL}, 0, 0, "inherit"},
139 {{NULL, NULL}, 0, 0, "on"},
140 {{zio_checksum_off, zio_checksum_off}, 0, 0, "off"},
141 {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "label"},
142 {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "gang_header"},
143 {{NULL, NULL}, 0, 0, "zilog"},
144 {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, "fletcher2"},
145 {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, "fletcher4"},
146 {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, "SHA256"},
147 {{NULL, NULL}, 0, 0, "zilog2"},
148 {{zio_checksum_off, zio_checksum_off}, 0, 0, "noparity"},
149 {{zio_checksum_SHA512, NULL}, 0, 0, "SHA512"}
153 * zio_checksum_verify: Provides support for checksum verification.
155 * Fletcher2, Fletcher4, SHA-256 and SHA-512/256 are supported.
157 * Return:
158 * -1 = Failure
159 * 0 = Success
161 static int
162 zio_checksum_verify(blkptr_t *bp, char *data, int size)
164 zio_cksum_t zc = bp->blk_cksum;
165 uint32_t checksum = BP_GET_CHECKSUM(bp);
166 int byteswap = BP_SHOULD_BYTESWAP(bp);
167 zio_eck_t *zec = (zio_eck_t *)(data + size) - 1;
168 zio_checksum_info_t *ci = &zio_checksum_table[checksum];
169 zio_cksum_t actual_cksum, expected_cksum;
171 if (byteswap) {
172 grub_printf("byteswap not supported\n");
173 return (-1);
176 if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) {
177 grub_printf("checksum algorithm %u not supported\n", checksum);
178 return (-1);
181 if (ci->ci_eck) {
182 expected_cksum = zec->zec_cksum;
183 zec->zec_cksum = zc;
184 ci->ci_func[0](data, size, &actual_cksum);
185 zec->zec_cksum = expected_cksum;
186 zc = expected_cksum;
187 } else {
188 ci->ci_func[byteswap](data, size, &actual_cksum);
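/* all four 64-bit words of the checksum must match exactly */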
191 if ((actual_cksum.zc_word[0] - zc.zc_word[0]) |
192 (actual_cksum.zc_word[1] - zc.zc_word[1]) |
193 (actual_cksum.zc_word[2] - zc.zc_word[2]) |
194 (actual_cksum.zc_word[3] - zc.zc_word[3]))
195 return (-1);
197 return (0);
201 * vdev_label_start returns the physical disk offset (in bytes) of
202 * label "l".
204 static uint64_t
205 vdev_label_start(uint64_t psize, int l)
207 return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
208 0 : psize - VDEV_LABELS * sizeof (vdev_label_t)));
212 * vdev_uberblock_compare takes two uberblock structures and returns an integer
213 * indicating the more recent of the two.
214 * Return Value = 1 if ub2 is more recent
215 * Return Value = -1 if ub1 is more recent
216 * The most recent uberblock is determined using its transaction number and
217 * timestamp. The uberblock with the highest transaction number is
218 * considered "newer". If the transaction numbers of the two blocks match, the
219 * timestamps are compared to determine the "newer" of the two.
221 static int
222 vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
224 if (ub1->ub_txg < ub2->ub_txg)
225 return (-1);
226 if (ub1->ub_txg > ub2->ub_txg)
227 return (1);
229 if (ub1->ub_timestamp < ub2->ub_timestamp)
230 return (-1);
231 if (ub1->ub_timestamp > ub2->ub_timestamp)
232 return (1);
234 return (0);
238 * Three pieces of information are needed to verify an uberblock: the magic
239 * number, the version number, and the checksum.
241 * Return:
242 * 0 - Success
243 * -1 - Failure
245 static int
246 uberblock_verify(uberblock_t *uber, uint64_t ub_size, uint64_t offset)
248 blkptr_t bp;
250 BP_ZERO(&bp);
251 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
252 BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER);
253 ZIO_SET_CHECKSUM(&bp.blk_cksum, offset, 0, 0, 0);
255 if (zio_checksum_verify(&bp, (char *)uber, ub_size) != 0)
256 return (-1);
258 if (uber->ub_magic == UBERBLOCK_MAGIC &&
259 SPA_VERSION_IS_SUPPORTED(uber->ub_version))
260 return (0);
262 return (-1);
266 * Find the best uberblock.
267 * Return:
268 * Success - Pointer to the best uberblock.
269 * Failure - NULL
271 static uberblock_t *
272 find_bestub(char *ub_array, uint64_t ashift, uint64_t sector)
274 uberblock_t *ubbest = NULL;
275 uberblock_t *ubnext;
276 uint64_t offset, ub_size;
277 int i;
279 ub_size = VDEV_UBERBLOCK_SIZE(ashift);
281 for (i = 0; i < VDEV_UBERBLOCK_COUNT(ashift); i++) {
282 ubnext = (uberblock_t *)ub_array;
283 ub_array += ub_size;
284 offset = (sector << SPA_MINBLOCKSHIFT) +
285 VDEV_UBERBLOCK_OFFSET(ashift, i);
287 if (uberblock_verify(ubnext, ub_size, offset) != 0)
288 continue;
290 if (ubbest == NULL ||
291 vdev_uberblock_compare(ubnext, ubbest) > 0)
292 ubbest = ubnext;
295 return (ubbest);
299 * Read a block of data based on the gang block address dva,
300 * and put its data in buf.
302 * Return:
303 * 0 - success
304 * 1 - failure
306 static int
307 zio_read_gang(blkptr_t *bp, dva_t *dva, void *buf, char *stack)
309 zio_gbh_phys_t *zio_gb;
310 uint64_t offset, sector;
311 blkptr_t tmpbp;
312 int i;
314 zio_gb = (zio_gbh_phys_t *)stack;
315 stack += SPA_GANGBLOCKSIZE;
316 offset = DVA_GET_OFFSET(dva);
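/* a DVA offset is relative to the end of the front labels and boot area; convert it to an absolute disk sector */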
317 sector = DVA_OFFSET_TO_PHYS_SECTOR(offset);
319 /* read in the gang block header */
320 if (devread(sector, 0, SPA_GANGBLOCKSIZE, (char *)zio_gb) == 0) {
321 grub_printf("failed to read in a gang block header\n");
322 return (1);
325 /* self-checksumming the gang block header */
326 BP_ZERO(&tmpbp);
327 BP_SET_CHECKSUM(&tmpbp, ZIO_CHECKSUM_GANG_HEADER);
328 BP_SET_BYTEORDER(&tmpbp, ZFS_HOST_BYTEORDER);
329 ZIO_SET_CHECKSUM(&tmpbp.blk_cksum, DVA_GET_VDEV(dva),
330 DVA_GET_OFFSET(dva), bp->blk_birth, 0);
331 if (zio_checksum_verify(&tmpbp, (char *)zio_gb, SPA_GANGBLOCKSIZE)) {
332 grub_printf("failed to checksum a gang block header\n");
333 return (1);
336 for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
337 if (BP_IS_HOLE(&zio_gb->zg_blkptr[i]))
338 continue;
340 if (zio_read_data(&zio_gb->zg_blkptr[i], buf, stack))
341 return (1);
342 buf += BP_GET_PSIZE(&zio_gb->zg_blkptr[i]);
345 return (0);
349 * Read in a block of raw data to buf.
351 * Return:
352 * 0 - success
353 * 1 - failure
355 static int
356 zio_read_data(blkptr_t *bp, void *buf, char *stack)
358 int i, psize;
360 psize = BP_GET_PSIZE(bp);
362 /* pick a good dva from the block pointer */
363 for (i = 0; i < SPA_DVAS_PER_BP; i++) {
364 uint64_t offset, sector;
366 if (bp->blk_dva[i].dva_word[0] == 0 &&
367 bp->blk_dva[i].dva_word[1] == 0)
368 continue;
370 if (DVA_GET_GANG(&bp->blk_dva[i])) {
371 if (zio_read_gang(bp, &bp->blk_dva[i], buf, stack) != 0)
372 continue;
373 } else {
374 /* read in a data block */
375 offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
376 sector = DVA_OFFSET_TO_PHYS_SECTOR(offset);
377 if (devread(sector, 0, psize, buf) == 0)
378 continue;
381 /* verify that the checksum matches */
382 if (zio_checksum_verify(bp, buf, psize) == 0) {
383 return (0);
387 grub_printf("could not read block due to EIO or ECKSUM\n");
388 return (1);
392 * buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be
393 * more than BPE_PAYLOAD_SIZE bytes).
395 static void
396 decode_embedded_bp_compressed(const blkptr_t *bp, void *buf)
398 int psize, i;
399 uint8_t *buf8 = buf;
400 uint64_t w = 0;
401 const uint64_t *bp64 = (const uint64_t *)bp;
403 psize = BPE_GET_PSIZE(bp);
406 * Decode the words of the block pointer into the byte array.
407 * Low bits of first word are the first byte (little endian).
409 for (i = 0; i < psize; i++) {
410 if (i % sizeof (w) == 0) {
411 /* beginning of a word */
412 w = *bp64;
413 bp64++;
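/* the blk_prop word of the block pointer carries no payload; skip it */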
414 if (!BPE_IS_PAYLOADWORD(bp, bp64))
415 bp64++;
417 buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY);
422 * Fill in the buffer with the (decompressed) payload of the embedded
423 * blkptr_t. Takes into account compression and byteorder (the payload is
424 * treated as a stream of bytes).
425 * Return 0 on success, or ENOSPC if it won't fit in the buffer.
427 static int
428 decode_embedded_bp(const blkptr_t *bp, void *buf)
430 int comp;
431 int lsize, psize;
432 uint8_t *dst = buf;
433 uint64_t w = 0;
435 lsize = BPE_GET_LSIZE(bp);
436 psize = BPE_GET_PSIZE(bp);
437 comp = BP_GET_COMPRESS(bp);
439 if (comp != ZIO_COMPRESS_OFF) {
440 uint8_t dstbuf[BPE_PAYLOAD_SIZE];
442 if ((unsigned int)comp >= ZIO_COMPRESS_FUNCTIONS ||
443 decomp_table[comp].decomp_func == NULL) {
444 grub_printf("compression algorithm not supported\n");
445 return (ERR_FSYS_CORRUPT);
448 decode_embedded_bp_compressed(bp, dstbuf);
449 decomp_table[comp].decomp_func(dstbuf, buf, psize, lsize);
450 } else {
451 decode_embedded_bp_compressed(bp, buf);
454 return (0);
458 * Read in a block of data, verify its checksum, decompress if needed,
459 * and put the uncompressed data in buf.
461 * Return:
462 * 0 - success
463 * errnum - failure
465 static int
466 zio_read(blkptr_t *bp, void *buf, char *stack)
468 int lsize, psize, comp;
469 char *retbuf;
471 if (BP_IS_EMBEDDED(bp)) {
472 if (BPE_GET_ETYPE(bp) != BP_EMBEDDED_TYPE_DATA) {
473 grub_printf("unsupported embedded BP (type=%u)\n",
474 (int)BPE_GET_ETYPE(bp));
475 return (ERR_FSYS_CORRUPT);
477 return (decode_embedded_bp(bp, buf));
480 comp = BP_GET_COMPRESS(bp);
481 lsize = BP_GET_LSIZE(bp);
482 psize = BP_GET_PSIZE(bp);
484 if ((unsigned int)comp >= ZIO_COMPRESS_FUNCTIONS ||
485 (comp != ZIO_COMPRESS_OFF &&
486 decomp_table[comp].decomp_func == NULL)) {
487 grub_printf("compression algorithm not supported\n");
488 return (ERR_FSYS_CORRUPT);
491 if ((char *)buf < stack && ((char *)buf) + lsize > stack) {
492 grub_printf("not enough memory to fit %u bytes on stack\n",
493 lsize);
494 return (ERR_WONT_FIT);
497 retbuf = buf;
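/* compressed blocks are read into the scratch stack first, then decompressed into the caller's buffer (retbuf) */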
498 if (comp != ZIO_COMPRESS_OFF) {
499 buf = stack;
500 stack += psize;
503 if (zio_read_data(bp, buf, stack) != 0) {
504 grub_printf("zio_read_data failed\n");
505 return (ERR_FSYS_CORRUPT);
508 if (comp != ZIO_COMPRESS_OFF) {
509 if (decomp_table[comp].decomp_func(buf, retbuf, psize,
510 lsize) != 0) {
511 grub_printf("zio_read decompression failed\n");
512 return (ERR_FSYS_CORRUPT);
516 return (0);
520 * Get the block for a given block id and read it into buf; any
521 * indirect blocks traversed along the way are staged on the stack.
523 * Return:
524 * 0 - success
525 * errnum - failure
527 static int
528 dmu_read(dnode_phys_t *dn, uint64_t blkid, void *buf, char *stack)
530 int idx, level;
531 blkptr_t *bp_array = dn->dn_blkptr;
532 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
533 blkptr_t *bp, *tmpbuf;
535 bp = (blkptr_t *)stack;
536 stack += sizeof (blkptr_t);
538 tmpbuf = (blkptr_t *)stack;
539 stack += 1<<dn->dn_indblkshift;
541 for (level = dn->dn_nlevels - 1; level >= 0; level--) {
542 idx = (blkid >> (epbs * level)) & ((1<<epbs)-1);
543 *bp = bp_array[idx];
544 if (level == 0)
545 tmpbuf = buf;
546 if (BP_IS_HOLE(bp)) {
547 grub_memset(buf, 0,
548 dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
549 break;
550 } else if (errnum = zio_read(bp, tmpbuf, stack)) {
551 return (errnum);
554 bp_array = tmpbuf;
557 return (0);
561 * mzap_lookup: Looks up property described by "name" and returns the value
562 * in "value".
564 * Return:
565 * 0 - success
566 * errnum - failure
568 static int
569 mzap_lookup(mzap_phys_t *zapobj, int objsize, const char *name,
570 uint64_t *value)
572 int i, chunks;
573 mzap_ent_phys_t *mzap_ent = zapobj->mz_chunk;
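/* the first chunk of a microzap block is its header, so it is excluded from the entry count */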
575 chunks = objsize / MZAP_ENT_LEN - 1;
576 for (i = 0; i < chunks; i++) {
577 if (grub_strcmp(mzap_ent[i].mze_name, name) == 0) {
578 *value = mzap_ent[i].mze_value;
579 return (0);
583 return (ERR_FSYS_CORRUPT);
586 static uint64_t
587 zap_hash(uint64_t salt, const char *name)
589 static uint64_t table[256];
590 const uint8_t *cp;
591 uint8_t c;
592 uint64_t crc = salt;
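/* lazily build the CRC-64 lookup table on first use */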
594 if (table[128] == 0) {
595 uint64_t *ct;
596 int i, j;
597 for (i = 0; i < 256; i++) {
598 for (ct = table + i, *ct = i, j = 8; j > 0; j--)
599 *ct = (*ct >> 1) ^ (-(*ct & 1) &
600 ZFS_CRC64_POLY);
604 if (crc == 0 || table[128] != ZFS_CRC64_POLY) {
605 errnum = ERR_FSYS_CORRUPT;
606 return (0);
609 for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
610 crc = (crc >> 8) ^ table[(crc ^ c) & 0xFF];
613 * Only use 28 bits, since we need 4 bits in the cookie for the
614 * collision differentiator. We MUST use the high bits, since
615 * those are the ones that we first pay attention to when
616 * choosing the bucket.
618 crc &= ~((1ULL << (64 - 28)) - 1);
620 return (crc);
624 * Only to be used on 8-bit arrays.
625 * array_len is actual len in bytes (not encoded le_value_length).
626 * buf is null-terminated.
628 static int
629 zap_leaf_array_equal(zap_leaf_phys_t *l, int blksft, int chunk,
630 int array_len, const char *buf)
632 int bseen = 0;
634 while (bseen < array_len) {
635 struct zap_leaf_array *la =
636 &ZAP_LEAF_CHUNK(l, blksft, chunk).l_array;
637 int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
639 if (chunk >= ZAP_LEAF_NUMCHUNKS(blksft))
640 return (0);
642 if (zfs_bcmp(la->la_array, buf + bseen, toread) != 0)
643 break;
644 chunk = la->la_next;
645 bseen += toread;
647 return (bseen == array_len);
651 * Given a zap_leaf_phys_t, walk through the zap leaf chunks to get the
652 * value for the property "name".
654 * Return:
655 * 0 - success
656 * errnum - failure
658 static int
659 zap_leaf_lookup(zap_leaf_phys_t *l, int blksft, uint64_t h,
660 const char *name, uint64_t *value)
662 uint16_t chunk;
663 struct zap_leaf_entry *le;
665 /* Verify that this is a valid leaf block */
666 if (l->l_hdr.lh_block_type != ZBT_LEAF)
667 return (ERR_FSYS_CORRUPT);
668 if (l->l_hdr.lh_magic != ZAP_LEAF_MAGIC)
669 return (ERR_FSYS_CORRUPT);
671 for (chunk = l->l_hash[LEAF_HASH(blksft, h)];
672 chunk != CHAIN_END; chunk = le->le_next) {
674 if (chunk >= ZAP_LEAF_NUMCHUNKS(blksft))
675 return (ERR_FSYS_CORRUPT);
677 le = ZAP_LEAF_ENTRY(l, blksft, chunk);
679 /* Verify the chunk entry */
680 if (le->le_type != ZAP_CHUNK_ENTRY)
681 return (ERR_FSYS_CORRUPT);
683 if (le->le_hash != h)
684 continue;
686 if (zap_leaf_array_equal(l, blksft, le->le_name_chunk,
687 le->le_name_length, name)) {
689 struct zap_leaf_array *la;
690 uint8_t *ip;
692 if (le->le_int_size != 8 || le->le_value_length != 1)
693 return (ERR_FSYS_CORRUPT);
695 /* get the uint64_t property value */
696 la = &ZAP_LEAF_CHUNK(l, blksft,
697 le->le_value_chunk).l_array;
698 ip = la->la_array;
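/* ZAP leaf arrays store integers in big-endian order; reassemble the 64-bit value byte by byte */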
700 *value = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
701 (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
702 (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
703 (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
705 return (0);
709 return (ERR_FSYS_CORRUPT);
713 * Fat ZAP lookup
715 * Return:
716 * 0 - success
717 * errnum - failure
719 static int
720 fzap_lookup(dnode_phys_t *zap_dnode, zap_phys_t *zap,
721 const char *name, uint64_t *value, char *stack)
723 zap_leaf_phys_t *l;
724 uint64_t hash, idx, blkid;
725 int blksft = zfs_log2(zap_dnode->dn_datablkszsec << DNODE_SHIFT);
727 /* Verify if this is a fat zap header block */
728 if (zap->zap_magic != (uint64_t)ZAP_MAGIC ||
729 zap->zap_flags != 0)
730 return (ERR_FSYS_CORRUPT);
732 hash = zap_hash(zap->zap_salt, name);
733 if (errnum)
734 return (errnum);
736 /* get block id from index */
737 if (zap->zap_ptrtbl.zt_numblks != 0) {
738 /* external pointer tables not supported */
739 return (ERR_FSYS_CORRUPT);
741 idx = ZAP_HASH_IDX(hash, zap->zap_ptrtbl.zt_shift);
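/* with no external pointer table, the leaf pointers live in the second half of the fat zap header block */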
742 blkid = ((uint64_t *)zap)[idx + (1<<(blksft-3-1))];
744 /* Get the leaf block */
745 l = (zap_leaf_phys_t *)stack;
746 stack += 1<<blksft;
747 if ((1<<blksft) < sizeof (zap_leaf_phys_t))
748 return (ERR_FSYS_CORRUPT);
749 if (errnum = dmu_read(zap_dnode, blkid, l, stack))
750 return (errnum);
752 return (zap_leaf_lookup(l, blksft, hash, name, value));
756 * Read in the data of a zap object and find the value for a matching
757 * property name.
759 * Return:
760 * 0 - success
761 * errnum - failure
763 static int
764 zap_lookup(dnode_phys_t *zap_dnode, const char *name, uint64_t *val,
765 char *stack)
767 uint64_t block_type;
768 int size;
769 void *zapbuf;
771 /* Read in the first block of the zap object data. */
772 zapbuf = stack;
773 size = zap_dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
774 stack += size;
776 if ((errnum = dmu_read(zap_dnode, 0, zapbuf, stack)) != 0)
777 return (errnum);
779 block_type = *((uint64_t *)zapbuf);
781 if (block_type == ZBT_MICRO) {
782 return (mzap_lookup(zapbuf, size, name, val));
783 } else if (block_type == ZBT_HEADER) {
784 /* this is a fat zap */
785 return (fzap_lookup(zap_dnode, zapbuf, name,
786 val, stack));
789 return (ERR_FSYS_CORRUPT);
792 typedef struct zap_attribute {
793 int za_integer_length;
794 uint64_t za_num_integers;
795 uint64_t za_first_integer;
796 char *za_name;
797 } zap_attribute_t;
799 typedef int (zap_cb_t)(zap_attribute_t *za, void *arg, char *stack);
801 static int
802 zap_iterate(dnode_phys_t *zap_dnode, zap_cb_t *cb, void *arg, char *stack)
804 uint32_t size = zap_dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
805 zap_attribute_t za;
806 int i;
807 mzap_phys_t *mzp = (mzap_phys_t *)stack;
808 stack += size;
810 if ((errnum = dmu_read(zap_dnode, 0, mzp, stack)) != 0)
811 return (errnum);
814 * Iteration over fatzap objects has not yet been implemented.
815 * If we encounter a pool in which there are more features for
816 * read than can fit inside a microzap (i.e., more than 2048
817 * features for read), we can add support for fatzap iteration.
818 * For now, fail.
820 if (mzp->mz_block_type != ZBT_MICRO) {
821 grub_printf("feature information stored in fatzap, pool "
822 "version not supported\n");
823 return (1);
826 za.za_integer_length = 8;
827 za.za_num_integers = 1;
828 for (i = 0; i < size / MZAP_ENT_LEN - 1; i++) {
829 mzap_ent_phys_t *mzep = &mzp->mz_chunk[i];
830 int err;
832 za.za_first_integer = mzep->mze_value;
833 za.za_name = mzep->mze_name;
834 err = cb(&za, arg, stack);
835 if (err != 0)
836 return (err);
839 return (0);
843 * Get the dnode of an object number from the metadnode of an object set.
845 * Input
846 * mdn - metadnode to get the object dnode
847 * objnum - object number for the object dnode
848 * type - if nonzero, object must be of this type
849 * buf - data buffer that holds the returning dnode
850 * stack - scratch area
852 * Return:
853 * 0 - success
854 * errnum - failure
856 static int
857 dnode_get(dnode_phys_t *mdn, uint64_t objnum, uint8_t type, dnode_phys_t *buf,
858 char *stack)
860 uint64_t blkid, blksz; /* the block id this object dnode is in */
861 int epbs; /* shift of number of dnodes in a block */
862 int idx; /* index within a block */
863 dnode_phys_t *dnbuf;
865 blksz = mdn->dn_datablkszsec << SPA_MINBLOCKSHIFT;
866 epbs = zfs_log2(blksz) - DNODE_SHIFT;
867 blkid = objnum >> epbs;
868 idx = objnum & ((1<<epbs)-1);
870 if (dnode_buf != NULL && dnode_mdn == mdn &&
871 objnum >= dnode_start && objnum < dnode_end) {
872 grub_memmove(buf, &dnode_buf[idx], DNODE_SIZE);
873 VERIFY_DN_TYPE(buf, type);
874 return (0);
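/* use the global dnode cache only when the metadnode block size matches the cache's size; otherwise stage the block on the stack */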
877 if (dnode_buf && blksz == 1<<DNODE_BLOCK_SHIFT) {
878 dnbuf = dnode_buf;
879 dnode_mdn = mdn;
880 dnode_start = blkid << epbs;
881 dnode_end = (blkid + 1) << epbs;
882 } else {
883 dnbuf = (dnode_phys_t *)stack;
884 stack += blksz;
887 if (errnum = dmu_read(mdn, blkid, (char *)dnbuf, stack))
888 return (errnum);
890 grub_memmove(buf, &dnbuf[idx], DNODE_SIZE);
891 VERIFY_DN_TYPE(buf, type);
893 return (0);
897 * Check if this is a special file that resides at the top
898 * dataset of the pool. Currently this is the GRUB menu,
899 * boot signature and boot signature backup.
900 * str starts with '/'.
902 static int
903 is_top_dataset_file(char *str)
905 char *tptr;
907 if ((tptr = grub_strstr(str, "menu.lst")) &&
908 (tptr[8] == '\0' || tptr[8] == ' ') &&
909 *(tptr-1) == '/')
910 return (1);
912 if (grub_strncmp(str, BOOTSIGN_DIR"/",
913 grub_strlen(BOOTSIGN_DIR) + 1) == 0)
914 return (1);
916 if (grub_strcmp(str, BOOTSIGN_BACKUP) == 0)
917 return (1);
919 return (0);
922 static int
923 check_feature(zap_attribute_t *za, void *arg, char *stack)
925 const char **names = arg;
926 int i;
928 if (za->za_first_integer == 0)
929 return (0);
931 for (i = 0; names[i] != NULL; i++) {
932 if (grub_strcmp(za->za_name, names[i]) == 0) {
933 return (0);
936 grub_printf("missing feature for read '%s'\n", za->za_name);
937 return (ERR_NEWER_VERSION);
941 * Get the file dnode for a given file name where mdn is the meta dnode
942 * for this ZFS object set. When found, place the file dnode in dn.
943 * The 'path' argument will be mangled.
945 * Return:
946 * 0 - success
947 * errnum - failure
949 static int
950 dnode_get_path(dnode_phys_t *mdn, char *path, dnode_phys_t *dn,
951 char *stack)
953 uint64_t objnum, version;
954 char *cname, ch;
956 if (errnum = dnode_get(mdn, MASTER_NODE_OBJ, DMU_OT_MASTER_NODE,
957 dn, stack))
958 return (errnum);
960 if (errnum = zap_lookup(dn, ZPL_VERSION_STR, &version, stack))
961 return (errnum);
962 if (version > ZPL_VERSION)
963 return (-1);
965 if (errnum = zap_lookup(dn, ZFS_ROOT_OBJ, &objnum, stack))
966 return (errnum);
968 if (errnum = dnode_get(mdn, objnum, DMU_OT_DIRECTORY_CONTENTS,
969 dn, stack))
970 return (errnum);
972 /* skip leading slashes */
973 while (*path == '/')
974 path++;
976 while (*path && !grub_isspace(*path)) {
978 /* get the next component name */
979 cname = path;
980 while (*path && !grub_isspace(*path) && *path != '/')
981 path++;
982 ch = *path;
983 *path = 0; /* ensure null termination */
985 if (errnum = zap_lookup(dn, cname, &objnum, stack))
986 return (errnum);
988 objnum = ZFS_DIRENT_OBJ(objnum);
989 if (errnum = dnode_get(mdn, objnum, 0, dn, stack))
990 return (errnum);
992 *path = ch;
993 while (*path == '/')
994 path++;
997 /* We found the dnode for this file. Verify if it is a plain file. */
998 VERIFY_DN_TYPE(dn, DMU_OT_PLAIN_FILE_CONTENTS);
1000 return (0);
1004 * Get the default 'bootfs' property value from the rootpool.
1006 * Return:
1007 * 0 - success
1008 * errnum - failure
1010 static int
1011 get_default_bootfsobj(dnode_phys_t *mosmdn, uint64_t *obj, char *stack)
1013 uint64_t objnum = 0;
1014 dnode_phys_t *dn = (dnode_phys_t *)stack;
1015 stack += DNODE_SIZE;
1017 if (errnum = dnode_get(mosmdn, DMU_POOL_DIRECTORY_OBJECT,
1018 DMU_OT_OBJECT_DIRECTORY, dn, stack))
1019 return (errnum);
1022 * find the object number for 'pool_props', and get the dnode
1023 * of the 'pool_props'.
1025 if (zap_lookup(dn, DMU_POOL_PROPS, &objnum, stack))
1026 return (ERR_FILESYSTEM_NOT_FOUND);
1028 if (errnum = dnode_get(mosmdn, objnum, DMU_OT_POOL_PROPS, dn, stack))
1029 return (errnum);
1031 if (zap_lookup(dn, ZPOOL_PROP_BOOTFS, &objnum, stack))
1032 return (ERR_FILESYSTEM_NOT_FOUND);
1034 if (!objnum)
1035 return (ERR_FILESYSTEM_NOT_FOUND);
1037 *obj = objnum;
1038 return (0);
1042 * List of pool features that the grub implementation of ZFS supports for
1043 * read. Note that features that are only required for write do not need
1044 * to be listed here since grub opens pools in read-only mode.
1046 * When this list is updated, the version number in usr/src/grub/capability
1047 * must be incremented to ensure the new grub gets installed.
1049 static const char *spa_feature_names[] = {
1050 "org.illumos:lz4_compress",
1051 "com.delphix:hole_birth",
1052 "com.delphix:extensible_dataset",
1053 "com.delphix:embedded_data",
1054 "org.open-zfs:large_blocks",
1055 "org.illumos:sha512",
1056 NULL
1060 * Checks whether the MOS features that are active are supported by this
1061 * (GRUB's) implementation of ZFS.
1063 * Return:
1064 * 0: Success.
1065 * errnum: Failure.
1067 static int
1068 check_mos_features(dnode_phys_t *mosmdn, char *stack)
1070 uint64_t objnum;
1071 dnode_phys_t *dn;
1072 uint8_t error = 0;
1074 dn = (dnode_phys_t *)stack;
1075 stack += DNODE_SIZE;
1077 if ((errnum = dnode_get(mosmdn, DMU_POOL_DIRECTORY_OBJECT,
1078 DMU_OT_OBJECT_DIRECTORY, dn, stack)) != 0)
1079 return (errnum);
1082 * Find the object number for 'features_for_read' and retrieve its
1083 * corresponding dnode. Note that we don't check features_for_write
1084 * because GRUB is not opening the pool for write.
1086 if ((errnum = zap_lookup(dn, DMU_POOL_FEATURES_FOR_READ, &objnum,
1087 stack)) != 0)
1088 return (errnum);
1090 if ((errnum = dnode_get(mosmdn, objnum, DMU_OTN_ZAP_METADATA,
1091 dn, stack)) != 0)
1092 return (errnum);
1094 return (zap_iterate(dn, check_feature, spa_feature_names, stack));
1098 * Given a MOS metadnode, get the metadnode of a given filesystem name (fsname),
1099 * e.g. pool/rootfs, or a given object number (obj), e.g. the object number
1100 * of pool/rootfs.
1102 * If no fsname and no obj are given, return the DSL_DIR metadnode.
1103 * If fsname is given, return its metadnode and its matching object number.
1104 * If only obj is given, return the metadnode for this object number.
1106 * Return:
1107 * 0 - success
1108 * errnum - failure
1110 static int
1111 get_objset_mdn(dnode_phys_t *mosmdn, char *fsname, uint64_t *obj,
1112 dnode_phys_t *mdn, char *stack)
1114 uint64_t objnum, headobj;
1115 char *cname, ch;
1116 blkptr_t *bp;
1117 objset_phys_t *osp;
1118 int issnapshot = 0;
1119 char *snapname;
1121 if (fsname == NULL && obj) {
1122 headobj = *obj;
1123 goto skip;
1126 if (errnum = dnode_get(mosmdn, DMU_POOL_DIRECTORY_OBJECT,
1127 DMU_OT_OBJECT_DIRECTORY, mdn, stack))
1128 return (errnum);
1130 if (errnum = zap_lookup(mdn, DMU_POOL_ROOT_DATASET, &objnum,
1131 stack))
1132 return (errnum);
1134 if (errnum = dnode_get(mosmdn, objnum, 0, mdn, stack))
1135 return (errnum);
1137 if (fsname == NULL) {
1138 headobj =
1139 ((dsl_dir_phys_t *)DN_BONUS(mdn))->dd_head_dataset_obj;
1140 goto skip;
1143 /* skip over the pool name */
1144 while (*fsname && !grub_isspace(*fsname) && *fsname != '/')
1145 fsname++;
1147 while (*fsname && !grub_isspace(*fsname)) {
1148 uint64_t childobj;
1150 while (*fsname == '/')
1151 fsname++;
1153 cname = fsname;
1154 while (*fsname && !grub_isspace(*fsname) && *fsname != '/')
1155 fsname++;
1156 ch = *fsname;
1157 *fsname = 0;
1159 snapname = cname;
1160 while (*snapname && !grub_isspace(*snapname) && *snapname !=
1161 '@')
1162 snapname++;
1163 if (*snapname == '@') {
1164 issnapshot = 1;
1165 *snapname = 0;
1167 childobj =
1168 ((dsl_dir_phys_t *)DN_BONUS(mdn))->dd_child_dir_zapobj;
1169 if (errnum = dnode_get(mosmdn, childobj,
1170 DMU_OT_DSL_DIR_CHILD_MAP, mdn, stack))
1171 return (errnum);
1173 if (zap_lookup(mdn, cname, &objnum, stack))
1174 return (ERR_FILESYSTEM_NOT_FOUND);
1176 if (errnum = dnode_get(mosmdn, objnum, 0,
1177 mdn, stack))
1178 return (errnum);
1180 *fsname = ch;
1181 if (issnapshot)
1182 *snapname = '@';
1184 headobj = ((dsl_dir_phys_t *)DN_BONUS(mdn))->dd_head_dataset_obj;
1185 if (obj)
1186 *obj = headobj;
1188 skip:
1189 if (errnum = dnode_get(mosmdn, headobj, 0, mdn, stack))
1190 return (errnum);
1191 if (issnapshot) {
1192 uint64_t snapobj;
1194 snapobj = ((dsl_dataset_phys_t *)DN_BONUS(mdn))->
1195 ds_snapnames_zapobj;
1197 if (errnum = dnode_get(mosmdn, snapobj,
1198 DMU_OT_DSL_DS_SNAP_MAP, mdn, stack))
1199 return (errnum);
1200 if (zap_lookup(mdn, snapname + 1, &headobj, stack))
1201 return (ERR_FILESYSTEM_NOT_FOUND);
1202 if (errnum = dnode_get(mosmdn, headobj, 0, mdn, stack))
1203 return (errnum);
1204 if (obj)
1205 *obj = headobj;
1208 bp = &((dsl_dataset_phys_t *)DN_BONUS(mdn))->ds_bp;
1209 osp = (objset_phys_t *)stack;
1210 stack += sizeof (objset_phys_t);
1211 if (errnum = zio_read(bp, osp, stack))
1212 return (errnum);
1214 grub_memmove((char *)mdn, (char *)&osp->os_meta_dnode, DNODE_SIZE);
1216 return (0);
1220 * For a given XDR packed nvlist, verify the first 4 bytes and move on.
1222 * An XDR packed nvlist is encoded as (comments from nvs_xdr_create) :
1224 * encoding method/host endian (4 bytes)
1225 * nvl_version (4 bytes)
1226 * nvl_nvflag (4 bytes)
1227 * encoded nvpairs:
1228 * encoded size of the nvpair (4 bytes)
1229 * decoded size of the nvpair (4 bytes)
1230 * name string size (4 bytes)
1231 * name string data (sizeof(NV_ALIGN4(string)))
1232 * data type (4 bytes)
1233 * # of elements in the nvpair (4 bytes)
1234 * data
1235 * 2 zeros for the last nvpair
1236 * (end of the entire list) (8 bytes)
1238 * Return:
1239 * 0 - success
1240 * 1 - failure
1242 static int
1243 nvlist_unpack(char *nvlist, char **out)
1245 /* Verify that the first two bytes of the nvlist are valid. */
1246 if (nvlist[0] != NV_ENCODE_XDR || nvlist[1] != HOST_ENDIAN)
1247 return (1);
1249 *out = nvlist + 4;
1250 return (0);
1253 static char *
1254 nvlist_array(char *nvlist, int index)
1256 int i, encode_size;
1258 for (i = 0; i < index; i++) {
1259 /* skip over nvl_version and nvl_nvflag */
1260 nvlist = nvlist + 4 * 2;
1262 while (encode_size = BSWAP_32(*(uint32_t *)nvlist))
1263 nvlist += encode_size; /* go to the next nvpair */
1265 nvlist = nvlist + 4 * 2; /* skip the ending 2 zeros - 8 bytes */
1268 return (nvlist);
1272 * The nvlist_next_nvpair() function returns a handle to the next nvpair in the
1273 * list following nvpair. If nvpair is NULL, the first pair is returned. If
1274 * nvpair is the last pair in the nvlist, NULL is returned.
1276 static char *
1277 nvlist_next_nvpair(char *nvl, char *nvpair)
1279 char *cur, *prev;
1280 int encode_size;
1282 if (nvl == NULL)
1283 return (NULL);
1285 if (nvpair == NULL) {
1286 /* skip over nvl_version and nvl_nvflag */
1287 nvpair = nvl + 4 * 2;
1288 } else {
1289 /* skip to the next nvpair */
1290 encode_size = BSWAP_32(*(uint32_t *)nvpair);
1291 nvpair += encode_size;
1294 /* 8 bytes of 0 marks the end of the list */
1295 if (*(uint64_t *)nvpair == 0)
1296 return (NULL);
1298 return (nvpair);
1302 * This function returns 0 on success and 1 on failure. On success, a string
1303 * containing the name of nvpair is saved in buf.
1305 static int
1306 nvpair_name(char *nvp, char *buf, int buflen)
1308 int len;
1310 /* skip over encode/decode size */
1311 nvp += 4 * 2;
1313 len = BSWAP_32(*(uint32_t *)nvp);
1314 if (buflen < len + 1)
1315 return (1);
1317 grub_memmove(buf, nvp + 4, len);
1318 buf[len] = '\0';
1320 return (0);
1324 * This function retrieves the value of the nvpair in the form of enumerated
1325 * type data_type_t. This is used to determine the appropriate type to pass to
1326 * nvpair_value().
1328 static int
1329 nvpair_type(char *nvp)
1331 int name_len, type;
1333 /* skip over encode/decode size */
1334 nvp += 4 * 2;
1336 /* skip over name_len */
1337 name_len = BSWAP_32(*(uint32_t *)nvp);
1338 nvp += 4;
1340 /* skip over name */
1341 nvp = nvp + ((name_len + 3) & ~3); /* align */
1343 type = BSWAP_32(*(uint32_t *)nvp);
1345 return (type);
1348 static int
1349 nvpair_value(char *nvp, void *val, int valtype, int *nelmp)
1351 int name_len, type, slen;
1352 char *strval = val;
1353 uint64_t *intval = val;
1355 /* skip over encode/decode size */
1356 nvp += 4 * 2;
1358 /* skip over name_len */
1359 name_len = BSWAP_32(*(uint32_t *)nvp);
1360 nvp += 4;
1362 /* skip over name */
1363 nvp = nvp + ((name_len + 3) & ~3); /* align */
1365 /* skip over type */
1366 type = BSWAP_32(*(uint32_t *)nvp);
1367 nvp += 4;
1369 if (type == valtype) {
1370 int nelm;
1372 nelm = BSWAP_32(*(uint32_t *)nvp);
1373 if (valtype != DATA_TYPE_BOOLEAN && nelm < 1)
1374 return (1);
1375 nvp += 4;
1377 switch (valtype) {
1378 case DATA_TYPE_BOOLEAN:
1379 return (0);
1381 case DATA_TYPE_STRING:
1382 slen = BSWAP_32(*(uint32_t *)nvp);
1383 nvp += 4;
1384 grub_memmove(strval, nvp, slen);
1385 strval[slen] = '\0';
1386 return (0);
1388 case DATA_TYPE_UINT64:
1389 *intval = BSWAP_64(*(uint64_t *)nvp);
1390 return (0);
1392 case DATA_TYPE_NVLIST:
1393 *(void **)val = (void *)nvp;
1394 return (0);
1396 case DATA_TYPE_NVLIST_ARRAY:
1397 *(void **)val = (void *)nvp;
1398 if (nelmp)
1399 *nelmp = nelm;
1400 return (0);
1404 return (1);
1407 static int
1408 nvlist_lookup_value(char *nvlist, char *name, void *val, int valtype,
1409 int *nelmp)
1411 char *nvpair;
1413 for (nvpair = nvlist_next_nvpair(nvlist, NULL);
1414 nvpair != NULL;
1415 nvpair = nvlist_next_nvpair(nvlist, nvpair)) {
1416 int name_len = BSWAP_32(*(uint32_t *)(nvpair + 4 * 2));
1417 char *nvp_name = nvpair + 4 * 3;
1419 if ((grub_strncmp(nvp_name, name, name_len) == 0) &&
1420 nvpair_type(nvpair) == valtype) {
1421 return (nvpair_value(nvpair, val, valtype, nelmp));
1424 return (1);
1428 * Check if this vdev is online and is in a good state.
1430 static int
1431 vdev_validate(char *nv)
1433 uint64_t ival;
1435 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_OFFLINE, &ival,
1436 DATA_TYPE_UINT64, NULL) == 0 ||
1437 nvlist_lookup_value(nv, ZPOOL_CONFIG_FAULTED, &ival,
1438 DATA_TYPE_UINT64, NULL) == 0 ||
1439 nvlist_lookup_value(nv, ZPOOL_CONFIG_REMOVED, &ival,
1440 DATA_TYPE_UINT64, NULL) == 0)
1441 return (ERR_DEV_VALUES);
1443 return (0);
1447 * Get a valid vdev pathname/devid from the boot device.
1448 * The caller must have already allocated MAXPATHLEN bytes for bootpath and devid.
1450 static int
1451 vdev_get_bootpath(char *nv, uint64_t inguid, char *devid, char *bootpath,
1452 int is_spare)
1454 char type[16];
1456 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_TYPE, &type, DATA_TYPE_STRING,
1457 NULL))
1458 return (ERR_FSYS_CORRUPT);
1460 if (grub_strcmp(type, VDEV_TYPE_DISK) == 0) {
1461 uint64_t guid;
1463 if (vdev_validate(nv) != 0)
1464 return (ERR_NO_BOOTPATH);
1466 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_GUID,
1467 &guid, DATA_TYPE_UINT64, NULL) != 0)
1468 return (ERR_NO_BOOTPATH);
1470 if (guid != inguid)
1471 return (ERR_NO_BOOTPATH);
1473 /* for a spare vdev, pick the disk labeled with "is_spare" */
1474 if (is_spare) {
1475 uint64_t spare = 0;
1476 (void) nvlist_lookup_value(nv, ZPOOL_CONFIG_IS_SPARE,
1477 &spare, DATA_TYPE_UINT64, NULL);
1478 if (!spare)
1479 return (ERR_NO_BOOTPATH);
1482 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_PHYS_PATH,
1483 bootpath, DATA_TYPE_STRING, NULL) != 0)
1484 bootpath[0] = '\0';
1486 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_DEVID,
1487 devid, DATA_TYPE_STRING, NULL) != 0)
1488 devid[0] = '\0';
1490 if (grub_strlen(bootpath) >= MAXPATHLEN ||
1491 grub_strlen(devid) >= MAXPATHLEN)
1492 return (ERR_WONT_FIT);
1494 return (0);
1496 } else if (grub_strcmp(type, VDEV_TYPE_MIRROR) == 0 ||
1497 grub_strcmp(type, VDEV_TYPE_REPLACING) == 0 ||
1498 (is_spare = (grub_strcmp(type, VDEV_TYPE_SPARE) == 0))) {
1499 int nelm, i;
1500 char *child;
1502 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_CHILDREN, &child,
1503 DATA_TYPE_NVLIST_ARRAY, &nelm))
1504 return (ERR_FSYS_CORRUPT);
1506 for (i = 0; i < nelm; i++) {
1507 char *child_i;
1509 child_i = nvlist_array(child, i);
1510 if (vdev_get_bootpath(child_i, inguid, devid,
1511 bootpath, is_spare) == 0)
1512 return (0);
1516 return (ERR_NO_BOOTPATH);
1520 * Check the disk label information and retrieve needed vdev name-value pairs.
1522 * Return:
1523 * 0 - success
1524 * ERR_* - failure
1526 static int
1527 check_pool_label(uint64_t sector, char *stack, char *outdevid,
1528 char *outpath, uint64_t *outguid, uint64_t *outashift, uint64_t *outversion)
1530 vdev_phys_t *vdev;
1531 uint64_t pool_state, txg = 0;
1532 char *nvlist, *nv, *features;
1533 uint64_t diskguid;
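/* skip the blank space and boot block header at the front of the label to reach the vdev_phys nvlist */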
1535 sector += (VDEV_SKIP_SIZE >> SPA_MINBLOCKSHIFT);
1537 /* Read in the vdev name-value pair list (112K). */
1538 if (devread(sector, 0, VDEV_PHYS_SIZE, stack) == 0)
1539 return (ERR_READ);
1541 vdev = (vdev_phys_t *)stack;
1542 stack += sizeof (vdev_phys_t);
1544 if (nvlist_unpack(vdev->vp_nvlist, &nvlist))
1545 return (ERR_FSYS_CORRUPT);
1547 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_STATE, &pool_state,
1548 DATA_TYPE_UINT64, NULL))
1549 return (ERR_FSYS_CORRUPT);
1551 if (pool_state == POOL_STATE_DESTROYED)
1552 return (ERR_FILESYSTEM_NOT_FOUND);
1554 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_NAME,
1555 current_rootpool, DATA_TYPE_STRING, NULL))
1556 return (ERR_FSYS_CORRUPT);
1558 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_TXG, &txg,
1559 DATA_TYPE_UINT64, NULL))
1560 return (ERR_FSYS_CORRUPT);
1562 /* not an active device */
1563 if (txg == 0)
1564 return (ERR_NO_BOOTPATH);
1566 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_VERSION, outversion,
1567 DATA_TYPE_UINT64, NULL))
1568 return (ERR_FSYS_CORRUPT);
1569 if (!SPA_VERSION_IS_SUPPORTED(*outversion))
1570 return (ERR_NEWER_VERSION);
1571 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_VDEV_TREE, &nv,
1572 DATA_TYPE_NVLIST, NULL))
1573 return (ERR_FSYS_CORRUPT);
1574 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_GUID, &diskguid,
1575 DATA_TYPE_UINT64, NULL))
1576 return (ERR_FSYS_CORRUPT);
1577 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_ASHIFT, outashift,
1578 DATA_TYPE_UINT64, NULL) != 0)
1579 return (ERR_FSYS_CORRUPT);
1580 if (vdev_get_bootpath(nv, diskguid, outdevid, outpath, 0))
1581 return (ERR_NO_BOOTPATH);
1582 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_GUID, outguid,
1583 DATA_TYPE_UINT64, NULL))
1584 return (ERR_FSYS_CORRUPT);
1586 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_FEATURES_FOR_READ,
1587 &features, DATA_TYPE_NVLIST, NULL) == 0) {
1588 char *nvp;
1589 char *name = stack;
1590 stack += MAXNAMELEN;
1592 for (nvp = nvlist_next_nvpair(features, NULL);
1593 nvp != NULL;
1594 nvp = nvlist_next_nvpair(features, nvp)) {
1595 zap_attribute_t za;
1597 if (nvpair_name(nvp, name, MAXNAMELEN) != 0)
1598 return (ERR_FSYS_CORRUPT);
1600 za.za_integer_length = 8;
1601 za.za_num_integers = 1;
1602 za.za_first_integer = 1;
1603 za.za_name = name;
1604 if (check_feature(&za, spa_feature_names, stack) != 0)
1605 return (ERR_NEWER_VERSION);
1609 return (0);
1613 * zfs_mount() locates a valid uberblock of the root pool and reads in its
1614 * MOS at the memory address MOS.
1616 * Return:
1617 * 1 - success
1618 * 0 - failure
1621 zfs_mount(void)
1623 char *stack, *ub_array;
1624 int label = 0;
1625 uberblock_t *ubbest;
1626 objset_phys_t *osp;
1627 char tmp_bootpath[MAXNAMELEN];
1628 char tmp_devid[MAXNAMELEN];
1629 uint64_t tmp_guid, ashift, version;
1630 uint64_t adjpl = (uint64_t)part_length << SPA_MINBLOCKSHIFT;
1631 int err = errnum; /* preserve previous errnum state */
1633 /* if it's our first time here, zero the best uberblock out */
1634 if (best_drive == 0 && best_part == 0 && find_best_root) {
1635 grub_memset(&current_uberblock, 0, sizeof (uberblock_t));
1636 pool_guid = 0;
1639 stackbase = ZFS_SCRATCH;
1640 stack = stackbase;
1641 ub_array = stack;
1642 stack += VDEV_UBERBLOCK_RING;
1644 osp = (objset_phys_t *)stack;
1645 stack += sizeof (objset_phys_t);
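/* round the device size down to a label-size multiple, matching how ZFS locates the two back labels */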
1646 adjpl = P2ALIGN(adjpl, (uint64_t)sizeof (vdev_label_t));
1648 for (label = 0; label < VDEV_LABELS; label++) {
1651 * Some El Torito stacks don't give us a size, so we
1652 * end up setting the size to MAXUINT; further, some
1653 * of these devices stop working once a single read
1654 * past the end has been issued. Checking for a
1655 * maximum part_length and skipping the backup labels
1656 * at the end of the slice/partition/device avoids
1657 * breaking down on such devices.
1659 if (part_length == MAXUINT && label == 2)
1660 break;
1662 uint64_t sector = vdev_label_start(adjpl,
1663 label) >> SPA_MINBLOCKSHIFT;
1665 /* Read in the uberblock ring (128K). */
1666 if (devread(sector +
1667 ((VDEV_SKIP_SIZE + VDEV_PHYS_SIZE) >> SPA_MINBLOCKSHIFT),
1668 0, VDEV_UBERBLOCK_RING, ub_array) == 0)
1669 continue;
1671 if (check_pool_label(sector, stack, tmp_devid,
1672 tmp_bootpath, &tmp_guid, &ashift, &version))
1673 continue;
1675 if (pool_guid == 0)
1676 pool_guid = tmp_guid;
1678 if ((ubbest = find_bestub(ub_array, ashift, sector)) == NULL ||
1679 zio_read(&ubbest->ub_rootbp, osp, stack) != 0)
1680 continue;
1682 VERIFY_OS_TYPE(osp, DMU_OST_META);
1684 if (version >= SPA_VERSION_FEATURES &&
1685 check_mos_features(&osp->os_meta_dnode, stack) != 0)
1686 continue;
1688 if (find_best_root && ((pool_guid != tmp_guid) ||
1689 vdev_uberblock_compare(ubbest, &(current_uberblock)) <= 0))
1690 continue;
1692 /* Got the MOS. Save it at the memory addr MOS. */
1693 grub_memmove(MOS, &osp->os_meta_dnode, DNODE_SIZE);
1694 grub_memmove(&current_uberblock, ubbest, sizeof (uberblock_t));
1695 grub_memmove(current_bootpath, tmp_bootpath, MAXNAMELEN);
1696 grub_memmove(current_devid, tmp_devid, grub_strlen(tmp_devid));
1697 is_zfs_mount = 1;
1698 return (1);
1702 * While some fs implementations (tftp) rely on setting and keeping
1703 * the global errnum set, others won't reset it and will break
1704 * when issuing rawreads. The goal here is to simply not
1705 * have zfs mount attempts impact the previous state.
1707 errnum = err;
1708 return (0);
1712 * zfs_open() locates a file in the rootpool by following the
1713 * MOS and places the dnode of the file at the memory address DNODE.
1715 * Return:
1716 * 1 - success
1717 * 0 - failure
1720 zfs_open(char *filename)
1722 char *stack;
1723 dnode_phys_t *mdn;
1725 file_buf = NULL;
1726 stackbase = ZFS_SCRATCH;
1727 stack = stackbase;
1729 mdn = (dnode_phys_t *)stack;
1730 stack += sizeof (dnode_phys_t);
1732 dnode_mdn = NULL;
1733 dnode_buf = (dnode_phys_t *)stack;
1734 stack += 1<<DNODE_BLOCK_SHIFT;
1737 * menu.lst is placed at the root pool filesystem level,
1738 * do not go to 'current_bootfs'.
1740 if (is_top_dataset_file(filename)) {
1741 if (errnum = get_objset_mdn(MOS, NULL, NULL, mdn, stack))
1742 return (0);
1744 current_bootfs_obj = 0;
1745 } else {
1746 if (current_bootfs[0] == '\0') {
1747 /* Get the default root filesystem object number */
1748 if (errnum = get_default_bootfsobj(MOS,
1749 &current_bootfs_obj, stack))
1750 return (0);
1752 if (errnum = get_objset_mdn(MOS, NULL,
1753 &current_bootfs_obj, mdn, stack))
1754 return (0);
1755 } else {
1756 if (errnum = get_objset_mdn(MOS, current_bootfs,
1757 &current_bootfs_obj, mdn, stack)) {
1758 grub_memset(current_bootfs, 0, MAXNAMELEN);
1759 return (0);
1764 if (dnode_get_path(mdn, filename, DNODE, stack)) {
1765 errnum = ERR_FILE_NOT_FOUND;
1766 return (0);
1769 /* get the file size and set the file position to 0 */
1772 * For DMU_OT_SA we will need to locate the SIZE attribute,
1773 * which could be either in the bonus buffer or in the
1774 * "spill" block.
1776 if (DNODE->dn_bonustype == DMU_OT_SA) {
1777 sa_hdr_phys_t *sahdrp;
1778 int hdrsize;
1780 if (DNODE->dn_bonuslen != 0) {
1781 sahdrp = (sa_hdr_phys_t *)DN_BONUS(DNODE);
1782 } else {
1783 if (DNODE->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
1784 blkptr_t *bp = &DNODE->dn_spill;
1785 void *buf;
1787 buf = (void *)stack;
1788 stack += BP_GET_LSIZE(bp);
1790 /* reset errnum to rawread() failure */
1791 errnum = 0;
1792 if (zio_read(bp, buf, stack) != 0) {
1793 return (0);
1795 sahdrp = buf;
1796 } else {
1797 errnum = ERR_FSYS_CORRUPT;
1798 return (0);
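/* assume the standard SA layout: the file size is SA_SIZE_OFFSET bytes past the variable-length SA header */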
1801 hdrsize = SA_HDR_SIZE(sahdrp);
1802 filemax = *(uint64_t *)((char *)sahdrp + hdrsize +
1803 SA_SIZE_OFFSET);
1804 } else {
1805 filemax = ((znode_phys_t *)DN_BONUS(DNODE))->zp_size;
1807 filepos = 0;
1809 dnode_buf = NULL;
1810 return (1);
1814 * zfs_read() reads in the data blocks pointed to by the DNODE.
1816 * Return:
1817 * len - the length successfully read in to the buffer
1818 * 0 - failure
1821 zfs_read(char *buf, int len)
1823 char *stack;
1824 int blksz, length, movesize;
1826 if (file_buf == NULL) {
1827 file_buf = stackbase;
1828 stackbase += SPA_MAXBLOCKSIZE;
1829 file_start = file_end = 0;
1831 stack = stackbase;
1834 * If the requested range is already in memory, copy it into the buffer provided and return.
1836 if (filepos >= file_start && filepos+len <= file_end) {
1837 grub_memmove(buf, file_buf + filepos - file_start, len);
1838 filepos += len;
1839 return (len);
1842 blksz = DNODE->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1845 * Note: for GRUB, SPA_MAXBLOCKSIZE is 128KB. There is not enough
1846 * memory to allocate the new max blocksize (16MB), so while
1847 * GRUB understands the large_blocks on-disk feature, it can't
1848 * actually read large blocks.
1850 if (blksz > SPA_MAXBLOCKSIZE) {
1851 grub_printf("blocks larger than 128K are not supported\n");
1852 return (0);
1856 * The entire file is too big to fit into the space available. We
1857 * will need to read it in chunks. This could be optimized to
1858 * read in as large a chunk as there is space available, but for
1859 * now, this only reads in one data block at a time.
1861 length = len;
1862 while (length) {
1864 * Find requested blkid and the offset within that block.
1866 uint64_t blkid = filepos / blksz;
1868 if (errnum = dmu_read(DNODE, blkid, file_buf, stack))
1869 return (0);
1871 file_start = blkid * blksz;
1872 file_end = file_start + blksz;
1874 movesize = MIN(length, file_end - filepos);
1876 grub_memmove(buf, file_buf + filepos - file_start,
1877 movesize);
1878 buf += movesize;
1879 length -= movesize;
1880 filepos += movesize;
1883 return (len);
1887 * No-Op
1890 zfs_embed(int *start_sector, int needed_sectors)
1892 return (1);
1895 #endif /* FSYS_ZFS */