6328 Fix cstyle errors in zfs codebase
[unleashed.git] / usr/src/grub/grub-0.97/stage2/fsys_zfs.c
1 /*
2 * GRUB -- GRand Unified Bootloader
3 * Copyright (C) 1999,2000,2001,2002,2003,2004 Free Software Foundation, Inc.
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
22 * Use is subject to license terms.
26 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
27 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
31 * The zfs plug-in routines for GRUB are:
33 * zfs_mount() - locates a valid uberblock of the root pool and reads
34 * in its MOS at the memory address MOS.
36 * zfs_open() - locates a plain file object by following the MOS
37 * and places its dnode at the memory address DNODE.
39 * zfs_read() - reads in the data blocks pointed to by the DNODE.
41 * ZFS_SCRATCH is used as a working area.
43 * (memory addr) MOS DNODE ZFS_SCRATCH
44 * | | |
45 * +-------V---------V----------V---------------+
46 * memory | | dnode | dnode | scratch |
47 * | | 512B | 512B | area |
48 * +--------------------------------------------+
51 #ifdef FSYS_ZFS
53 #include "shared.h"
54 #include "filesys.h"
55 #include "fsys_zfs.h"
57 /* cache for a file block of the currently zfs_open()-ed file */
58 static void *file_buf = NULL;
59 static uint64_t file_start = 0;
60 static uint64_t file_end = 0;
62 /* cache for a dnode block */
63 static dnode_phys_t *dnode_buf = NULL;
64 static dnode_phys_t *dnode_mdn = NULL;
65 static uint64_t dnode_start = 0;
66 static uint64_t dnode_end = 0;
68 static uint64_t pool_guid = 0;
69 static uberblock_t current_uberblock;
70 static char *stackbase;
72 decomp_entry_t decomp_table[ZIO_COMPRESS_FUNCTIONS] =
74 {"inherit", 0}, /* ZIO_COMPRESS_INHERIT */
75 {"on", lzjb_decompress}, /* ZIO_COMPRESS_ON */
76 {"off", 0}, /* ZIO_COMPRESS_OFF */
77 {"lzjb", lzjb_decompress}, /* ZIO_COMPRESS_LZJB */
78 {"empty", 0}, /* ZIO_COMPRESS_EMPTY */
79 {"gzip-1", 0}, /* ZIO_COMPRESS_GZIP_1 */
80 {"gzip-2", 0}, /* ZIO_COMPRESS_GZIP_2 */
81 {"gzip-3", 0}, /* ZIO_COMPRESS_GZIP_3 */
82 {"gzip-4", 0}, /* ZIO_COMPRESS_GZIP_4 */
83 {"gzip-5", 0}, /* ZIO_COMPRESS_GZIP_5 */
84 {"gzip-6", 0}, /* ZIO_COMPRESS_GZIP_6 */
85 {"gzip-7", 0}, /* ZIO_COMPRESS_GZIP_7 */
86 {"gzip-8", 0}, /* ZIO_COMPRESS_GZIP_8 */
87 {"gzip-9", 0}, /* ZIO_COMPRESS_GZIP_9 */
88 {"zle", 0}, /* ZIO_COMPRESS_ZLE */
89 {"lz4", lz4_decompress} /* ZIO_COMPRESS_LZ4 */
92 static int zio_read_data(blkptr_t *bp, void *buf, char *stack);
95 * Our own version of bcmp().
97 static int
98 zfs_bcmp(const void *s1, const void *s2, size_t n)
100 const uchar_t *ps1 = s1;
101 const uchar_t *ps2 = s2;
103 if (s1 != s2 && n != 0) {
104 do {
105 if (*ps1++ != *ps2++)
106 return (1);
107 } while (--n != 0);
110 return (0);
114 * Our own version of log2(). Same thing as highbit()-1.
116 static int
117 zfs_log2(uint64_t num)
119 int i = 0;
121 while (num > 1) {
122 i++;
123 num = num >> 1;
126 return (i);
129 /* Checksum Functions */
130 static void
131 zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
133 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
136 /* Checksum Table and Values */
137 zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
138 {{NULL, NULL}, 0, 0, "inherit"},
139 {{NULL, NULL}, 0, 0, "on"},
140 {{zio_checksum_off, zio_checksum_off}, 0, 0, "off"},
141 {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "label"},
142 {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "gang_header"},
143 {{NULL, NULL}, 0, 0, "zilog"},
144 {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, "fletcher2"},
145 {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, "fletcher4"},
146 {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, "SHA256"},
147 {{NULL, NULL}, 0, 0, "zilog2"},
148 {{zio_checksum_off, zio_checksum_off}, 0, 0, "noparity"},
149 {{zio_checksum_SHA512, NULL}, 0, 0, "SHA512"}
153 * zio_checksum_verify: Provides support for checksum verification.
155 * Fletcher2, Fletcher4, SHA-256 and SHA-512/256 are supported.
157 * Return:
158 * -1 = Failure
159 * 0 = Success
161 static int
162 zio_checksum_verify(blkptr_t *bp, char *data, int size)
164 zio_cksum_t zc = bp->blk_cksum;
165 uint32_t checksum = BP_GET_CHECKSUM(bp);
166 int byteswap = BP_SHOULD_BYTESWAP(bp);
167 zio_eck_t *zec = (zio_eck_t *)(data + size) - 1;
168 zio_checksum_info_t *ci = &zio_checksum_table[checksum];
169 zio_cksum_t actual_cksum, expected_cksum;
171 if (byteswap) {
172 grub_printf("byteswap not supported\n");
173 return (-1);
176 if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) {
177 grub_printf("checksum algorithm %u not supported\n", checksum);
178 return (-1);
181 if (ci->ci_eck) {
182 expected_cksum = zec->zec_cksum;
183 zec->zec_cksum = zc;
184 ci->ci_func[0](data, size, &actual_cksum);
185 zec->zec_cksum = expected_cksum;
186 zc = expected_cksum;
187 } else {
188 ci->ci_func[byteswap](data, size, &actual_cksum);
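/* all four 64-bit words of the checksum must match exactly */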
191 if ((actual_cksum.zc_word[0] - zc.zc_word[0]) |
192 (actual_cksum.zc_word[1] - zc.zc_word[1]) |
193 (actual_cksum.zc_word[2] - zc.zc_word[2]) |
194 (actual_cksum.zc_word[3] - zc.zc_word[3]))
195 return (-1);
197 return (0);
201 * vdev_label_start returns the physical disk offset (in bytes) of
202 * label "l".
204 static uint64_t
205 vdev_label_start(uint64_t psize, int l)
207 return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
208 0 : psize - VDEV_LABELS * sizeof (vdev_label_t)));
212 * vdev_uberblock_compare takes two uberblock structures and returns an integer
213 * indicating the more recent of the two.
214 * Return Value = 1 if ub2 is more recent
215 * Return Value = -1 if ub1 is more recent
216 * The most recent uberblock is determined using its transaction number and
217 * timestamp. The uberblock with the highest transaction number is
218 * considered "newer". If the transaction numbers of the two blocks match, the
219 * timestamps are compared to determine the "newer" of the two.
221 static int
222 vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
224 if (ub1->ub_txg < ub2->ub_txg)
225 return (-1);
226 if (ub1->ub_txg > ub2->ub_txg)
227 return (1);
229 if (ub1->ub_timestamp < ub2->ub_timestamp)
230 return (-1);
231 if (ub1->ub_timestamp > ub2->ub_timestamp)
232 return (1);
234 return (0);
238 * Three pieces of information are needed to verify an uberblock: the magic
239 * number, the version number, and the checksum.
241 * Return:
242 * 0 - Success
243 * -1 - Failure
245 static int
246 uberblock_verify(uberblock_t *uber, uint64_t ub_size, uint64_t offset)
248 blkptr_t bp;
250 BP_ZERO(&bp);
251 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
252 BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER);
253 ZIO_SET_CHECKSUM(&bp.blk_cksum, offset, 0, 0, 0);
255 if (zio_checksum_verify(&bp, (char *)uber, ub_size) != 0)
256 return (-1);
258 if (uber->ub_magic == UBERBLOCK_MAGIC &&
259 SPA_VERSION_IS_SUPPORTED(uber->ub_version))
260 return (0);
262 return (-1);
266 * Find the best uberblock.
267 * Return:
268 * Success - Pointer to the best uberblock.
269 * Failure - NULL
271 static uberblock_t *
272 find_bestub(char *ub_array, uint64_t ashift, uint64_t sector)
274 uberblock_t *ubbest = NULL;
275 uberblock_t *ubnext;
276 uint64_t offset, ub_size;
277 int i;
279 ub_size = VDEV_UBERBLOCK_SIZE(ashift);
281 for (i = 0; i < VDEV_UBERBLOCK_COUNT(ashift); i++) {
282 ubnext = (uberblock_t *)ub_array;
283 ub_array += ub_size;
284 offset = (sector << SPA_MINBLOCKSHIFT) +
285 VDEV_UBERBLOCK_OFFSET(ashift, i);
287 if (uberblock_verify(ubnext, ub_size, offset) != 0)
288 continue;
290 if (ubbest == NULL ||
291 vdev_uberblock_compare(ubnext, ubbest) > 0)
292 ubbest = ubnext;
295 return (ubbest);
299 * Read a block of data based on the gang block address dva,
300 * and put its data in buf.
302 * Return:
303 * 0 - success
304 * 1 - failure
306 static int
307 zio_read_gang(blkptr_t *bp, dva_t *dva, void *buf, char *stack)
309 zio_gbh_phys_t *zio_gb;
310 uint64_t offset, sector;
311 blkptr_t tmpbp;
312 int i;
314 zio_gb = (zio_gbh_phys_t *)stack;
315 stack += SPA_GANGBLOCKSIZE;
316 offset = DVA_GET_OFFSET(dva);
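/* a DVA offset is relative to the end of the front labels and boot area; convert it to an absolute disk sector */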
317 sector = DVA_OFFSET_TO_PHYS_SECTOR(offset);
319 /* read in the gang block header */
320 if (devread(sector, 0, SPA_GANGBLOCKSIZE, (char *)zio_gb) == 0) {
321 grub_printf("failed to read in a gang block header\n");
322 return (1);
325 /* self-checksumming the gang block header */
326 BP_ZERO(&tmpbp);
327 BP_SET_CHECKSUM(&tmpbp, ZIO_CHECKSUM_GANG_HEADER);
328 BP_SET_BYTEORDER(&tmpbp, ZFS_HOST_BYTEORDER);
329 ZIO_SET_CHECKSUM(&tmpbp.blk_cksum, DVA_GET_VDEV(dva),
330 DVA_GET_OFFSET(dva), bp->blk_birth, 0);
331 if (zio_checksum_verify(&tmpbp, (char *)zio_gb, SPA_GANGBLOCKSIZE)) {
332 grub_printf("failed to checksum a gang block header\n");
333 return (1);
336 for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
337 if (BP_IS_HOLE(&zio_gb->zg_blkptr[i]))
338 continue;
340 if (zio_read_data(&zio_gb->zg_blkptr[i], buf, stack))
341 return (1);
342 buf += BP_GET_PSIZE(&zio_gb->zg_blkptr[i]);
345 return (0);
349 * Read in a block of raw data to buf.
351 * Return:
352 * 0 - success
353 * 1 - failure
355 static int
356 zio_read_data(blkptr_t *bp, void *buf, char *stack)
358 int i, psize;
360 psize = BP_GET_PSIZE(bp);
362 /* pick a good dva from the block pointer */
363 for (i = 0; i < SPA_DVAS_PER_BP; i++) {
364 uint64_t offset, sector;
366 if (bp->blk_dva[i].dva_word[0] == 0 &&
367 bp->blk_dva[i].dva_word[1] == 0)
368 continue;
370 if (DVA_GET_GANG(&bp->blk_dva[i])) {
371 if (zio_read_gang(bp, &bp->blk_dva[i], buf, stack) != 0)
372 continue;
373 } else {
374 /* read in a data block */
375 offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
376 sector = DVA_OFFSET_TO_PHYS_SECTOR(offset);
377 if (devread(sector, 0, psize, buf) == 0)
378 continue;
381 /* verify that the checksum matches */
382 if (zio_checksum_verify(bp, buf, psize) == 0) {
383 return (0);
387 grub_printf("could not read block due to EIO or ECKSUM\n");
388 return (1);
392 * buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be
393 * more than BPE_PAYLOAD_SIZE bytes).
395 static void
396 decode_embedded_bp_compressed(const blkptr_t *bp, void *buf)
398 int psize, i;
399 uint8_t *buf8 = buf;
400 uint64_t w = 0;
401 const uint64_t *bp64 = (const uint64_t *)bp;
403 psize = BPE_GET_PSIZE(bp);
406 * Decode the words of the block pointer into the byte array.
407 * Low bits of first word are the first byte (little endian).
409 for (i = 0; i < psize; i++) {
410 if (i % sizeof (w) == 0) {
411 /* beginning of a word */
412 w = *bp64;
413 bp64++;
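/* the blk_prop word of the block pointer carries no payload; skip it */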
414 if (!BPE_IS_PAYLOADWORD(bp, bp64))
415 bp64++;
417 buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY);
422 * Fill in the buffer with the (decompressed) payload of the embedded
423 * blkptr_t. Takes into account compression and byteorder (the payload is
424 * treated as a stream of bytes).
425 * Return 0 on success, or ENOSPC if it won't fit in the buffer.
427 static int
428 decode_embedded_bp(const blkptr_t *bp, void *buf)
430 int comp;
431 int lsize, psize;
432 uint8_t *dst = buf;
433 uint64_t w = 0;
435 lsize = BPE_GET_LSIZE(bp);
436 psize = BPE_GET_PSIZE(bp);
437 comp = BP_GET_COMPRESS(bp);
439 if (comp != ZIO_COMPRESS_OFF) {
440 uint8_t dstbuf[BPE_PAYLOAD_SIZE];
442 if ((unsigned int)comp >= ZIO_COMPRESS_FUNCTIONS ||
443 decomp_table[comp].decomp_func == NULL) {
444 grub_printf("compression algorithm not supported\n");
445 return (ERR_FSYS_CORRUPT);
448 decode_embedded_bp_compressed(bp, dstbuf);
449 decomp_table[comp].decomp_func(dstbuf, buf, psize, lsize);
450 } else {
451 decode_embedded_bp_compressed(bp, buf);
454 return (0);
458 * Read in a block of data, verify its checksum, decompress if needed,
459 * and put the uncompressed data in buf.
461 * Return:
462 * 0 - success
463 * errnum - failure
465 static int
466 zio_read(blkptr_t *bp, void *buf, char *stack)
468 int lsize, psize, comp;
469 char *retbuf;
471 if (BP_IS_EMBEDDED(bp)) {
472 if (BPE_GET_ETYPE(bp) != BP_EMBEDDED_TYPE_DATA) {
473 grub_printf("unsupported embedded BP (type=%u)\n",
474 (int)BPE_GET_ETYPE(bp));
475 return (ERR_FSYS_CORRUPT);
477 return (decode_embedded_bp(bp, buf));
480 comp = BP_GET_COMPRESS(bp);
481 lsize = BP_GET_LSIZE(bp);
482 psize = BP_GET_PSIZE(bp);
484 if ((unsigned int)comp >= ZIO_COMPRESS_FUNCTIONS ||
485 (comp != ZIO_COMPRESS_OFF &&
486 decomp_table[comp].decomp_func == NULL)) {
487 grub_printf("compression algorithm not supported\n");
488 return (ERR_FSYS_CORRUPT);
491 if ((char *)buf < stack && ((char *)buf) + lsize > stack) {
492 grub_printf("not enough memory to fit %u bytes on stack\n",
493 lsize);
494 return (ERR_WONT_FIT);
497 retbuf = buf;
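/* compressed blocks are read into the scratch stack first, then decompressed into the caller's buffer (retbuf) */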
498 if (comp != ZIO_COMPRESS_OFF) {
499 buf = stack;
500 stack += psize;
503 if (zio_read_data(bp, buf, stack) != 0) {
504 grub_printf("zio_read_data failed\n");
505 return (ERR_FSYS_CORRUPT);
508 if (comp != ZIO_COMPRESS_OFF) {
509 if (decomp_table[comp].decomp_func(buf, retbuf, psize,
510 lsize) != 0) {
511 grub_printf("zio_read decompression failed\n");
512 return (ERR_FSYS_CORRUPT);
516 return (0);
520 * Get the block for a given block id and read it into buf; any
521 * indirect blocks traversed along the way are staged on the stack.
523 * Return:
524 * 0 - success
525 * errnum - failure
527 static int
528 dmu_read(dnode_phys_t *dn, uint64_t blkid, void *buf, char *stack)
530 int idx, level;
531 blkptr_t *bp_array = dn->dn_blkptr;
532 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
533 blkptr_t *bp, *tmpbuf;
535 bp = (blkptr_t *)stack;
536 stack += sizeof (blkptr_t);
538 tmpbuf = (blkptr_t *)stack;
539 stack += 1<<dn->dn_indblkshift;
541 for (level = dn->dn_nlevels - 1; level >= 0; level--) {
542 idx = (blkid >> (epbs * level)) & ((1<<epbs)-1);
543 *bp = bp_array[idx];
544 if (level == 0)
545 tmpbuf = buf;
546 if (BP_IS_HOLE(bp)) {
547 grub_memset(buf, 0,
548 dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
549 break;
550 } else if (errnum = zio_read(bp, tmpbuf, stack)) {
551 return (errnum);
554 bp_array = tmpbuf;
557 return (0);
561 * mzap_lookup: Looks up property described by "name" and returns the value
562 * in "value".
564 * Return:
565 * 0 - success
566 * errnum - failure
568 static int
569 mzap_lookup(mzap_phys_t *zapobj, int objsize, const char *name,
570 uint64_t *value)
572 int i, chunks;
573 mzap_ent_phys_t *mzap_ent = zapobj->mz_chunk;
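/* the first chunk of a microzap block is its header, so it is excluded from the entry count */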
575 chunks = objsize / MZAP_ENT_LEN - 1;
576 for (i = 0; i < chunks; i++) {
577 if (grub_strcmp(mzap_ent[i].mze_name, name) == 0) {
578 *value = mzap_ent[i].mze_value;
579 return (0);
583 return (ERR_FSYS_CORRUPT);
586 static uint64_t
587 zap_hash(uint64_t salt, const char *name)
589 static uint64_t table[256];
590 const uint8_t *cp;
591 uint8_t c;
592 uint64_t crc = salt;
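/* lazily build the CRC-64 lookup table on first use */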
594 if (table[128] == 0) {
595 uint64_t *ct;
596 int i, j;
597 for (i = 0; i < 256; i++) {
598 for (ct = table + i, *ct = i, j = 8; j > 0; j--)
599 *ct = (*ct >> 1) ^ (-(*ct & 1) &
600 ZFS_CRC64_POLY);
604 if (crc == 0 || table[128] != ZFS_CRC64_POLY) {
605 errnum = ERR_FSYS_CORRUPT;
606 return (0);
609 for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
610 crc = (crc >> 8) ^ table[(crc ^ c) & 0xFF];
613 * Only use 28 bits, since we need 4 bits in the cookie for the
614 * collision differentiator. We MUST use the high bits, since
615 * those are the ones that we first pay attention to when
616 * choosing the bucket.
618 crc &= ~((1ULL << (64 - 28)) - 1);
620 return (crc);
624 * Only to be used on 8-bit arrays.
625 * array_len is actual len in bytes (not encoded le_value_length).
626 * buf is null-terminated.
628 static int
629 zap_leaf_array_equal(zap_leaf_phys_t *l, int blksft, int chunk,
630 int array_len, const char *buf)
632 int bseen = 0;
634 while (bseen < array_len) {
635 struct zap_leaf_array *la =
636 &ZAP_LEAF_CHUNK(l, blksft, chunk).l_array;
637 int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES);
639 if (chunk >= ZAP_LEAF_NUMCHUNKS(blksft))
640 return (0);
642 if (zfs_bcmp(la->la_array, buf + bseen, toread) != 0)
643 break;
644 chunk = la->la_next;
645 bseen += toread;
647 return (bseen == array_len);
651 * Given a zap_leaf_phys_t, walk through the zap leaf chunks to get the
652 * value for the property "name".
654 * Return:
655 * 0 - success
656 * errnum - failure
658 static int
659 zap_leaf_lookup(zap_leaf_phys_t *l, int blksft, uint64_t h,
660 const char *name, uint64_t *value)
662 uint16_t chunk;
663 struct zap_leaf_entry *le;
665 /* Verify that this is a valid leaf block */
666 if (l->l_hdr.lh_block_type != ZBT_LEAF)
667 return (ERR_FSYS_CORRUPT);
668 if (l->l_hdr.lh_magic != ZAP_LEAF_MAGIC)
669 return (ERR_FSYS_CORRUPT);
671 for (chunk = l->l_hash[LEAF_HASH(blksft, h)];
672 chunk != CHAIN_END; chunk = le->le_next) {
674 if (chunk >= ZAP_LEAF_NUMCHUNKS(blksft))
675 return (ERR_FSYS_CORRUPT);
677 le = ZAP_LEAF_ENTRY(l, blksft, chunk);
679 /* Verify the chunk entry */
680 if (le->le_type != ZAP_CHUNK_ENTRY)
681 return (ERR_FSYS_CORRUPT);
683 if (le->le_hash != h)
684 continue;
686 if (zap_leaf_array_equal(l, blksft, le->le_name_chunk,
687 le->le_name_length, name)) {
689 struct zap_leaf_array *la;
690 uint8_t *ip;
692 if (le->le_int_size != 8 || le->le_value_length != 1)
693 return (ERR_FSYS_CORRUPT);
695 /* get the uint64_t property value */
696 la = &ZAP_LEAF_CHUNK(l, blksft,
697 le->le_value_chunk).l_array;
698 ip = la->la_array;
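/* ZAP leaf arrays store integers in big-endian order; reassemble the 64-bit value byte by byte */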
700 *value = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
701 (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
702 (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
703 (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
705 return (0);
709 return (ERR_FSYS_CORRUPT);
713 * Fat ZAP lookup
715 * Return:
716 * 0 - success
717 * errnum - failure
719 static int
720 fzap_lookup(dnode_phys_t *zap_dnode, zap_phys_t *zap,
721 const char *name, uint64_t *value, char *stack)
723 zap_leaf_phys_t *l;
724 uint64_t hash, idx, blkid;
725 int blksft = zfs_log2(zap_dnode->dn_datablkszsec << DNODE_SHIFT);
727 /* Verify if this is a fat zap header block */
728 if (zap->zap_magic != (uint64_t)ZAP_MAGIC ||
729 zap->zap_flags != 0)
730 return (ERR_FSYS_CORRUPT);
732 hash = zap_hash(zap->zap_salt, name);
733 if (errnum)
734 return (errnum);
736 /* get block id from index */
737 if (zap->zap_ptrtbl.zt_numblks != 0) {
738 /* external pointer tables not supported */
739 return (ERR_FSYS_CORRUPT);
741 idx = ZAP_HASH_IDX(hash, zap->zap_ptrtbl.zt_shift);
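/* with no external pointer table, the leaf pointers live in the second half of the fat zap header block */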
742 blkid = ((uint64_t *)zap)[idx + (1<<(blksft-3-1))];
744 /* Get the leaf block */
745 l = (zap_leaf_phys_t *)stack;
746 stack += 1<<blksft;
747 if ((1<<blksft) < sizeof (zap_leaf_phys_t))
748 return (ERR_FSYS_CORRUPT);
749 if (errnum = dmu_read(zap_dnode, blkid, l, stack))
750 return (errnum);
752 return (zap_leaf_lookup(l, blksft, hash, name, value));
756 * Read in the data of a zap object and find the value for a matching
757 * property name.
759 * Return:
760 * 0 - success
761 * errnum - failure
763 static int
764 zap_lookup(dnode_phys_t *zap_dnode, const char *name, uint64_t *val,
765 char *stack)
767 uint64_t block_type;
768 int size;
769 void *zapbuf;
771 /* Read in the first block of the zap object data. */
772 zapbuf = stack;
773 size = zap_dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
774 stack += size;
776 if ((errnum = dmu_read(zap_dnode, 0, zapbuf, stack)) != 0)
777 return (errnum);
779 block_type = *((uint64_t *)zapbuf);
781 if (block_type == ZBT_MICRO) {
782 return (mzap_lookup(zapbuf, size, name, val));
783 } else if (block_type == ZBT_HEADER) {
784 /* this is a fat zap */
785 return (fzap_lookup(zap_dnode, zapbuf, name,
786 val, stack));
789 return (ERR_FSYS_CORRUPT);
792 typedef struct zap_attribute {
793 int za_integer_length;
794 uint64_t za_num_integers;
795 uint64_t za_first_integer;
796 char *za_name;
797 } zap_attribute_t;
799 typedef int (zap_cb_t)(zap_attribute_t *za, void *arg, char *stack);
801 static int
802 zap_iterate(dnode_phys_t *zap_dnode, zap_cb_t *cb, void *arg, char *stack)
804 uint32_t size = zap_dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
805 zap_attribute_t za;
806 int i;
807 mzap_phys_t *mzp = (mzap_phys_t *)stack;
808 stack += size;
810 if ((errnum = dmu_read(zap_dnode, 0, mzp, stack)) != 0)
811 return (errnum);
814 * Iteration over fatzap objects has not yet been implemented.
815 * If we encounter a pool in which there are more features for
816 * read than can fit inside a microzap (i.e., more than 2048
817 * features for read), we can add support for fatzap iteration.
818 * For now, fail.
820 if (mzp->mz_block_type != ZBT_MICRO) {
821 grub_printf("feature information stored in fatzap, pool "
822 "version not supported\n");
823 return (1);
826 za.za_integer_length = 8;
827 za.za_num_integers = 1;
828 for (i = 0; i < size / MZAP_ENT_LEN - 1; i++) {
829 mzap_ent_phys_t *mzep = &mzp->mz_chunk[i];
830 int err;
832 za.za_first_integer = mzep->mze_value;
833 za.za_name = mzep->mze_name;
834 err = cb(&za, arg, stack);
835 if (err != 0)
836 return (err);
839 return (0);
843 * Get the dnode of an object number from the metadnode of an object set.
845 * Input
846 * mdn - metadnode to get the object dnode
847 * objnum - object number for the object dnode
848 * type - if nonzero, object must be of this type
849 * buf - data buffer that holds the returning dnode
850 * stack - scratch area
852 * Return:
853 * 0 - success
854 * errnum - failure
856 static int
857 dnode_get(dnode_phys_t *mdn, uint64_t objnum, uint8_t type, dnode_phys_t *buf,
858 char *stack)
860 uint64_t blkid, blksz; /* the block id this object dnode is in */
861 int epbs; /* shift of number of dnodes in a block */
862 int idx; /* index within a block */
863 dnode_phys_t *dnbuf;
865 blksz = mdn->dn_datablkszsec << SPA_MINBLOCKSHIFT;
866 epbs = zfs_log2(blksz) - DNODE_SHIFT;
867 blkid = objnum >> epbs;
868 idx = objnum & ((1<<epbs)-1);
870 if (dnode_buf != NULL && dnode_mdn == mdn &&
871 objnum >= dnode_start && objnum < dnode_end) {
872 grub_memmove(buf, &dnode_buf[idx], DNODE_SIZE);
873 VERIFY_DN_TYPE(buf, type);
874 return (0);
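/* use the global dnode cache only when the metadnode block size matches the cache's size; otherwise stage the block on the stack */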
877 if (dnode_buf && blksz == 1<<DNODE_BLOCK_SHIFT) {
878 dnbuf = dnode_buf;
879 dnode_mdn = mdn;
880 dnode_start = blkid << epbs;
881 dnode_end = (blkid + 1) << epbs;
882 } else {
883 dnbuf = (dnode_phys_t *)stack;
884 stack += blksz;
887 if (errnum = dmu_read(mdn, blkid, (char *)dnbuf, stack))
888 return (errnum);
890 grub_memmove(buf, &dnbuf[idx], DNODE_SIZE);
891 VERIFY_DN_TYPE(buf, type);
893 return (0);
897 * Check if this is a special file that resides at the top
898 * dataset of the pool. Currently this is the GRUB menu,
899 * boot signature and boot signature backup.
900 * str starts with '/'.
902 static int
903 is_top_dataset_file(char *str)
905 char *tptr;
907 if ((tptr = grub_strstr(str, "menu.lst")) &&
908 (tptr[8] == '\0' || tptr[8] == ' ') &&
909 *(tptr-1) == '/')
910 return (1);
912 if (grub_strncmp(str, BOOTSIGN_DIR"/",
913 grub_strlen(BOOTSIGN_DIR) + 1) == 0)
914 return (1);
916 if (grub_strcmp(str, BOOTSIGN_BACKUP) == 0)
917 return (1);
919 return (0);
922 static int
923 check_feature(zap_attribute_t *za, void *arg, char *stack)
925 const char **names = arg;
926 int i;
928 if (za->za_first_integer == 0)
929 return (0);
931 for (i = 0; names[i] != NULL; i++) {
932 if (grub_strcmp(za->za_name, names[i]) == 0) {
933 return (0);
936 grub_printf("missing feature for read '%s'\n", za->za_name);
937 return (ERR_NEWER_VERSION);
941 * Get the file dnode for a given file name where mdn is the meta dnode
942 * for this ZFS object set. When found, place the file dnode in dn.
943 * The 'path' argument will be mangled.
945 * Return:
946 * 0 - success
947 * errnum - failure
949 static int
950 dnode_get_path(dnode_phys_t *mdn, char *path, dnode_phys_t *dn,
951 char *stack)
953 uint64_t objnum, version;
954 char *cname, ch;
956 if (errnum = dnode_get(mdn, MASTER_NODE_OBJ, DMU_OT_MASTER_NODE,
957 dn, stack))
958 return (errnum);
960 if (errnum = zap_lookup(dn, ZPL_VERSION_STR, &version, stack))
961 return (errnum);
962 if (version > ZPL_VERSION)
963 return (-1);
965 if (errnum = zap_lookup(dn, ZFS_ROOT_OBJ, &objnum, stack))
966 return (errnum);
968 if (errnum = dnode_get(mdn, objnum, DMU_OT_DIRECTORY_CONTENTS,
969 dn, stack))
970 return (errnum);
972 /* skip leading slashes */
973 while (*path == '/')
974 path++;
976 while (*path && !grub_isspace(*path)) {
978 /* get the next component name */
979 cname = path;
980 while (*path && !grub_isspace(*path) && *path != '/')
981 path++;
982 ch = *path;
983 *path = 0; /* ensure null termination */
985 if (errnum = zap_lookup(dn, cname, &objnum, stack))
986 return (errnum);
988 objnum = ZFS_DIRENT_OBJ(objnum);
989 if (errnum = dnode_get(mdn, objnum, 0, dn, stack))
990 return (errnum);
992 *path = ch;
993 while (*path == '/')
994 path++;
997 /* We found the dnode for this file. Verify if it is a plain file. */
998 VERIFY_DN_TYPE(dn, DMU_OT_PLAIN_FILE_CONTENTS);
1000 return (0);
1004 * Get the default 'bootfs' property value from the rootpool.
1006 * Return:
1007 * 0 - success
1008 * errnum - failure
1010 static int
1011 get_default_bootfsobj(dnode_phys_t *mosmdn, uint64_t *obj, char *stack)
1013 uint64_t objnum = 0;
1014 dnode_phys_t *dn = (dnode_phys_t *)stack;
1015 stack += DNODE_SIZE;
1017 if (errnum = dnode_get(mosmdn, DMU_POOL_DIRECTORY_OBJECT,
1018 DMU_OT_OBJECT_DIRECTORY, dn, stack))
1019 return (errnum);
1022 * find the object number for 'pool_props', and get the dnode
1023 * of the 'pool_props'.
1025 if (zap_lookup(dn, DMU_POOL_PROPS, &objnum, stack))
1026 return (ERR_FILESYSTEM_NOT_FOUND);
1028 if (errnum = dnode_get(mosmdn, objnum, DMU_OT_POOL_PROPS, dn, stack))
1029 return (errnum);
1031 if (zap_lookup(dn, ZPOOL_PROP_BOOTFS, &objnum, stack))
1032 return (ERR_FILESYSTEM_NOT_FOUND);
1034 if (!objnum)
1035 return (ERR_FILESYSTEM_NOT_FOUND);
1037 *obj = objnum;
1038 return (0);
1042 * List of pool features that the grub implementation of ZFS supports for
1043 * read. Note that features that are only required for write do not need
1044 * to be listed here since grub opens pools in read-only mode.
1046 * When this list is updated, the version number in usr/src/grub/capability
1047 * must be incremented to ensure the new grub gets installed.
1049 static const char *spa_feature_names[] = {
1050 "org.illumos:lz4_compress",
1051 "com.delphix:hole_birth",
1052 "com.delphix:extensible_dataset",
1053 "com.delphix:embedded_data",
1054 "org.open-zfs:large_blocks",
1055 "org.illumos:sha512",
1056 NULL
1060 * Checks whether the MOS features that are active are supported by this
1061 * (GRUB's) implementation of ZFS.
1063 * Return:
1064 * 0: Success.
1065 * errnum: Failure.
1067 static int
1068 check_mos_features(dnode_phys_t *mosmdn, char *stack)
1070 uint64_t objnum;
1071 dnode_phys_t *dn;
1072 uint8_t error = 0;
1074 dn = (dnode_phys_t *)stack;
1075 stack += DNODE_SIZE;
1077 if ((errnum = dnode_get(mosmdn, DMU_POOL_DIRECTORY_OBJECT,
1078 DMU_OT_OBJECT_DIRECTORY, dn, stack)) != 0)
1079 return (errnum);
1082 * Find the object number for 'features_for_read' and retrieve its
1083 * corresponding dnode. Note that we don't check features_for_write
1084 * because GRUB is not opening the pool for write.
1086 if ((errnum = zap_lookup(dn, DMU_POOL_FEATURES_FOR_READ, &objnum,
1087 stack)) != 0)
1088 return (errnum);
1090 if ((errnum = dnode_get(mosmdn, objnum, DMU_OTN_ZAP_METADATA,
1091 dn, stack)) != 0)
1092 return (errnum);
1094 return (zap_iterate(dn, check_feature, spa_feature_names, stack));
1098 * Given a MOS metadnode, get the metadnode of a given filesystem name (fsname),
1099 * e.g. pool/rootfs, or a given object number (obj), e.g. the object number
1100 * of pool/rootfs.
1102 * If no fsname and no obj are given, return the DSL_DIR metadnode.
1103 * If fsname is given, return its metadnode and its matching object number.
1104 * If only obj is given, return the metadnode for this object number.
1106 * Return:
1107 * 0 - success
1108 * errnum - failure
1110 static int
1111 get_objset_mdn(dnode_phys_t *mosmdn, char *fsname, uint64_t *obj,
1112 dnode_phys_t *mdn, char *stack)
1114 uint64_t objnum, headobj;
1115 char *cname, ch;
1116 blkptr_t *bp;
1117 objset_phys_t *osp;
1118 int issnapshot = 0;
1119 char *snapname;
1121 if (fsname == NULL && obj) {
1122 headobj = *obj;
1123 goto skip;
1126 if (errnum = dnode_get(mosmdn, DMU_POOL_DIRECTORY_OBJECT,
1127 DMU_OT_OBJECT_DIRECTORY, mdn, stack))
1128 return (errnum);
1130 if (errnum = zap_lookup(mdn, DMU_POOL_ROOT_DATASET, &objnum,
1131 stack))
1132 return (errnum);
1134 if (errnum = dnode_get(mosmdn, objnum, 0, mdn, stack))
1135 return (errnum);
1137 if (fsname == NULL) {
1138 headobj =
1139 ((dsl_dir_phys_t *)DN_BONUS(mdn))->dd_head_dataset_obj;
1140 goto skip;
1143 /* skip over the pool name */
1144 while (*fsname && !grub_isspace(*fsname) && *fsname != '/')
1145 fsname++;
1147 while (*fsname && !grub_isspace(*fsname)) {
1148 uint64_t childobj;
1150 while (*fsname == '/')
1151 fsname++;
1153 cname = fsname;
1154 while (*fsname && !grub_isspace(*fsname) && *fsname != '/')
1155 fsname++;
1156 ch = *fsname;
1157 *fsname = 0;
1159 snapname = cname;
1160 while (*snapname && !grub_isspace(*snapname) && *snapname !=
1161 '@')
1162 snapname++;
1163 if (*snapname == '@') {
1164 issnapshot = 1;
1165 *snapname = 0;
1167 childobj =
1168 ((dsl_dir_phys_t *)DN_BONUS(mdn))->dd_child_dir_zapobj;
1169 if (errnum = dnode_get(mosmdn, childobj,
1170 DMU_OT_DSL_DIR_CHILD_MAP, mdn, stack))
1171 return (errnum);
1173 if (zap_lookup(mdn, cname, &objnum, stack))
1174 return (ERR_FILESYSTEM_NOT_FOUND);
1176 if (errnum = dnode_get(mosmdn, objnum, 0,
1177 mdn, stack))
1178 return (errnum);
1180 *fsname = ch;
1181 if (issnapshot)
1182 *snapname = '@';
1184 headobj = ((dsl_dir_phys_t *)DN_BONUS(mdn))->dd_head_dataset_obj;
1185 if (obj)
1186 *obj = headobj;
1188 skip:
1189 if (errnum = dnode_get(mosmdn, headobj, 0, mdn, stack))
1190 return (errnum);
1191 if (issnapshot) {
1192 uint64_t snapobj;
1194 snapobj = ((dsl_dataset_phys_t *)DN_BONUS(mdn))->
1195 ds_snapnames_zapobj;
1197 if (errnum = dnode_get(mosmdn, snapobj,
1198 DMU_OT_DSL_DS_SNAP_MAP, mdn, stack))
1199 return (errnum);
1200 if (zap_lookup(mdn, snapname + 1, &headobj, stack))
1201 return (ERR_FILESYSTEM_NOT_FOUND);
1202 if (errnum = dnode_get(mosmdn, headobj, 0, mdn, stack))
1203 return (errnum);
1204 if (obj)
1205 *obj = headobj;
1208 bp = &((dsl_dataset_phys_t *)DN_BONUS(mdn))->ds_bp;
1209 osp = (objset_phys_t *)stack;
1210 stack += sizeof (objset_phys_t);
1211 if (errnum = zio_read(bp, osp, stack))
1212 return (errnum);
1214 grub_memmove((char *)mdn, (char *)&osp->os_meta_dnode, DNODE_SIZE);
1216 return (0);
1220 * For a given XDR packed nvlist, verify the first 4 bytes and move on.
1222 * An XDR packed nvlist is encoded as (comments from nvs_xdr_create) :
1224 * encoding method/host endian (4 bytes)
1225 * nvl_version (4 bytes)
1226 * nvl_nvflag (4 bytes)
1227 * encoded nvpairs:
1228 * encoded size of the nvpair (4 bytes)
1229 * decoded size of the nvpair (4 bytes)
1230 * name string size (4 bytes)
1231 * name string data (sizeof(NV_ALIGN4(string)))
1232 * data type (4 bytes)
1233 * # of elements in the nvpair (4 bytes)
1234 * data
1235 * 2 zeros for the last nvpair
1236 * (end of the entire list) (8 bytes)
1238 * Return:
1239 * 0 - success
1240 * 1 - failure
1242 static int
1243 nvlist_unpack(char *nvlist, char **out)
1245 /* Verify that the first two bytes of the nvlist are valid. */
1246 if (nvlist[0] != NV_ENCODE_XDR || nvlist[1] != HOST_ENDIAN)
1247 return (1);
1249 *out = nvlist + 4;
1250 return (0);
1253 static char *
1254 nvlist_array(char *nvlist, int index)
1256 int i, encode_size;
1258 for (i = 0; i < index; i++) {
1259 /* skip over nvl_version and nvl_nvflag */
1260 nvlist = nvlist + 4 * 2;
1262 while (encode_size = BSWAP_32(*(uint32_t *)nvlist))
1263 nvlist += encode_size; /* go to the next nvpair */
1265 nvlist = nvlist + 4 * 2; /* skip the ending 2 zeros - 8 bytes */
1268 return (nvlist);
1272 * The nvlist_next_nvpair() function returns a handle to the next nvpair in the
1273 * list following nvpair. If nvpair is NULL, the first pair is returned. If
1274 * nvpair is the last pair in the nvlist, NULL is returned.
1276 static char *
1277 nvlist_next_nvpair(char *nvl, char *nvpair)
1279 char *cur, *prev;
1280 int encode_size;
1282 if (nvl == NULL)
1283 return (NULL);
1285 if (nvpair == NULL) {
1286 /* skip over nvl_version and nvl_nvflag */
1287 nvpair = nvl + 4 * 2;
1288 } else {
1289 /* skip to the next nvpair */
1290 encode_size = BSWAP_32(*(uint32_t *)nvpair);
1291 nvpair += encode_size;
1294 /* 8 bytes of 0 marks the end of the list */
1295 if (*(uint64_t *)nvpair == 0)
1296 return (NULL);
1298 return (nvpair);
1302 * This function returns 0 on success and 1 on failure. On success, a string
1303 * containing the name of nvpair is saved in buf.
1305 static int
1306 nvpair_name(char *nvp, char *buf, int buflen)
1308 int len;
1310 /* skip over encode/decode size */
1311 nvp += 4 * 2;
1313 len = BSWAP_32(*(uint32_t *)nvp);
1314 if (buflen < len + 1)
1315 return (1);
1317 grub_memmove(buf, nvp + 4, len);
1318 buf[len] = '\0';
1320 return (0);
1324 * This function retrieves the value of the nvpair in the form of enumerated
1325 * type data_type_t. This is used to determine the appropriate type to pass to
1326 * nvpair_value().
1328 static int
1329 nvpair_type(char *nvp)
1331 int name_len, type;
1333 /* skip over encode/decode size */
1334 nvp += 4 * 2;
1336 /* skip over name_len */
1337 name_len = BSWAP_32(*(uint32_t *)nvp);
1338 nvp += 4;
1340 /* skip over name */
1341 nvp = nvp + ((name_len + 3) & ~3); /* align */
1343 type = BSWAP_32(*(uint32_t *)nvp);
1345 return (type);
1348 static int
1349 nvpair_value(char *nvp, void *val, int valtype, int *nelmp)
1351 int name_len, type, slen;
1352 char *strval = val;
1353 uint64_t *intval = val;
1355 /* skip over encode/decode size */
1356 nvp += 4 * 2;
1358 /* skip over name_len */
1359 name_len = BSWAP_32(*(uint32_t *)nvp);
1360 nvp += 4;
1362 /* skip over name */
1363 nvp = nvp + ((name_len + 3) & ~3); /* align */
1365 /* skip over type */
1366 type = BSWAP_32(*(uint32_t *)nvp);
1367 nvp += 4;
1369 if (type == valtype) {
1370 int nelm;
1372 nelm = BSWAP_32(*(uint32_t *)nvp);
1373 if (valtype != DATA_TYPE_BOOLEAN && nelm < 1)
1374 return (1);
1375 nvp += 4;
1377 switch (valtype) {
1378 case DATA_TYPE_BOOLEAN:
1379 return (0);
1381 case DATA_TYPE_STRING:
1382 slen = BSWAP_32(*(uint32_t *)nvp);
1383 nvp += 4;
1384 grub_memmove(strval, nvp, slen);
1385 strval[slen] = '\0';
1386 return (0);
1388 case DATA_TYPE_UINT64:
1389 *intval = BSWAP_64(*(uint64_t *)nvp);
1390 return (0);
1392 case DATA_TYPE_NVLIST:
1393 *(void **)val = (void *)nvp;
1394 return (0);
1396 case DATA_TYPE_NVLIST_ARRAY:
1397 *(void **)val = (void *)nvp;
1398 if (nelmp)
1399 *nelmp = nelm;
1400 return (0);
1404 return (1);
1407 static int
1408 nvlist_lookup_value(char *nvlist, char *name, void *val, int valtype,
1409 int *nelmp)
1411 char *nvpair;
1413 for (nvpair = nvlist_next_nvpair(nvlist, NULL);
1414 nvpair != NULL;
1415 nvpair = nvlist_next_nvpair(nvlist, nvpair)) {
1416 int name_len = BSWAP_32(*(uint32_t *)(nvpair + 4 * 2));
1417 char *nvp_name = nvpair + 4 * 3;
1419 if ((grub_strncmp(nvp_name, name, name_len) == 0) &&
1420 nvpair_type(nvpair) == valtype) {
1421 return (nvpair_value(nvpair, val, valtype, nelmp));
1424 return (1);
1428 * Check if this vdev is online and is in a good state.
1430 static int
1431 vdev_validate(char *nv)
1433 uint64_t ival;
1435 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_OFFLINE, &ival,
1436 DATA_TYPE_UINT64, NULL) == 0 ||
1437 nvlist_lookup_value(nv, ZPOOL_CONFIG_FAULTED, &ival,
1438 DATA_TYPE_UINT64, NULL) == 0 ||
1439 nvlist_lookup_value(nv, ZPOOL_CONFIG_REMOVED, &ival,
1440 DATA_TYPE_UINT64, NULL) == 0)
1441 return (ERR_DEV_VALUES);
1443 return (0);
1447 * Get a valid vdev pathname/devid from the boot device.
1448 * The caller must have already allocated MAXPATHLEN bytes for bootpath and devid.
1450 static int
1451 vdev_get_bootpath(char *nv, uint64_t inguid, char *devid, char *bootpath,
1452 int is_spare)
1454 char type[16];
1456 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_TYPE, &type, DATA_TYPE_STRING,
1457 NULL))
1458 return (ERR_FSYS_CORRUPT);
1460 if (grub_strcmp(type, VDEV_TYPE_DISK) == 0) {
1461 uint64_t guid;
1463 if (vdev_validate(nv) != 0)
1464 return (ERR_NO_BOOTPATH);
1466 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_GUID,
1467 &guid, DATA_TYPE_UINT64, NULL) != 0)
1468 return (ERR_NO_BOOTPATH);
1470 if (guid != inguid)
1471 return (ERR_NO_BOOTPATH);
1473 /* for a spare vdev, pick the disk labeled with "is_spare" */
1474 if (is_spare) {
1475 uint64_t spare = 0;
1476 (void) nvlist_lookup_value(nv, ZPOOL_CONFIG_IS_SPARE,
1477 &spare, DATA_TYPE_UINT64, NULL);
1478 if (!spare)
1479 return (ERR_NO_BOOTPATH);
1482 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_PHYS_PATH,
1483 bootpath, DATA_TYPE_STRING, NULL) != 0)
1484 bootpath[0] = '\0';
1486 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_DEVID,
1487 devid, DATA_TYPE_STRING, NULL) != 0)
1488 devid[0] = '\0';
1490 if (grub_strlen(bootpath) >= MAXPATHLEN ||
1491 grub_strlen(devid) >= MAXPATHLEN)
1492 return (ERR_WONT_FIT);
1494 return (0);
1496 } else if (grub_strcmp(type, VDEV_TYPE_MIRROR) == 0 ||
1497 grub_strcmp(type, VDEV_TYPE_REPLACING) == 0 ||
1498 (is_spare = (grub_strcmp(type, VDEV_TYPE_SPARE) == 0))) {
1499 int nelm, i;
1500 char *child;
1502 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_CHILDREN, &child,
1503 DATA_TYPE_NVLIST_ARRAY, &nelm))
1504 return (ERR_FSYS_CORRUPT);
1506 for (i = 0; i < nelm; i++) {
1507 char *child_i;
1509 child_i = nvlist_array(child, i);
1510 if (vdev_get_bootpath(child_i, inguid, devid,
1511 bootpath, is_spare) == 0)
1512 return (0);
1516 return (ERR_NO_BOOTPATH);
1520 * Check the disk label information and retrieve needed vdev name-value pairs.
1522 * Return:
1523 * 0 - success
1524 * ERR_* - failure
1526 static int
1527 check_pool_label(uint64_t sector, char *stack, char *outdevid,
1528 char *outpath, uint64_t *outguid, uint64_t *outashift, uint64_t *outversion)
1530 vdev_phys_t *vdev;
1531 uint64_t pool_state, txg = 0;
1532 char *nvlist, *nv, *features;
1533 uint64_t diskguid;
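/* skip the blank space and boot block header at the front of the label to reach the vdev_phys nvlist */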
1535 sector += (VDEV_SKIP_SIZE >> SPA_MINBLOCKSHIFT);
1537 /* Read in the vdev name-value pair list (112K). */
1538 if (devread(sector, 0, VDEV_PHYS_SIZE, stack) == 0)
1539 return (ERR_READ);
1541 vdev = (vdev_phys_t *)stack;
1542 stack += sizeof (vdev_phys_t);
1544 if (nvlist_unpack(vdev->vp_nvlist, &nvlist))
1545 return (ERR_FSYS_CORRUPT);
1547 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_STATE, &pool_state,
1548 DATA_TYPE_UINT64, NULL))
1549 return (ERR_FSYS_CORRUPT);
1551 if (pool_state == POOL_STATE_DESTROYED)
1552 return (ERR_FILESYSTEM_NOT_FOUND);
1554 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_NAME,
1555 current_rootpool, DATA_TYPE_STRING, NULL))
1556 return (ERR_FSYS_CORRUPT);
1558 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_TXG, &txg,
1559 DATA_TYPE_UINT64, NULL))
1560 return (ERR_FSYS_CORRUPT);
1562 /* not an active device */
1563 if (txg == 0)
1564 return (ERR_NO_BOOTPATH);
1566 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_VERSION, outversion,
1567 DATA_TYPE_UINT64, NULL))
1568 return (ERR_FSYS_CORRUPT);
1569 if (!SPA_VERSION_IS_SUPPORTED(*outversion))
1570 return (ERR_NEWER_VERSION);
1571 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_VDEV_TREE, &nv,
1572 DATA_TYPE_NVLIST, NULL))
1573 return (ERR_FSYS_CORRUPT);
1574 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_GUID, &diskguid,
1575 DATA_TYPE_UINT64, NULL))
1576 return (ERR_FSYS_CORRUPT);
1577 if (nvlist_lookup_value(nv, ZPOOL_CONFIG_ASHIFT, outashift,
1578 DATA_TYPE_UINT64, NULL) != 0)
1579 return (ERR_FSYS_CORRUPT);
1580 if (vdev_get_bootpath(nv, diskguid, outdevid, outpath, 0))
1581 return (ERR_NO_BOOTPATH);
1582 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_POOL_GUID, outguid,
1583 DATA_TYPE_UINT64, NULL))
1584 return (ERR_FSYS_CORRUPT);
1586 if (nvlist_lookup_value(nvlist, ZPOOL_CONFIG_FEATURES_FOR_READ,
1587 &features, DATA_TYPE_NVLIST, NULL) == 0) {
1588 char *nvp;
1589 char *name = stack;
1590 stack += MAXNAMELEN;
1592 for (nvp = nvlist_next_nvpair(features, NULL);
1593 nvp != NULL;
1594 nvp = nvlist_next_nvpair(features, nvp)) {
1595 zap_attribute_t za;
1597 if (nvpair_name(nvp, name, MAXNAMELEN) != 0)
1598 return (ERR_FSYS_CORRUPT);
1600 za.za_integer_length = 8;
1601 za.za_num_integers = 1;
1602 za.za_first_integer = 1;
1603 za.za_name = name;
1604 if (check_feature(&za, spa_feature_names, stack) != 0)
1605 return (ERR_NEWER_VERSION);
1609 return (0);
1613 * zfs_mount() locates a valid uberblock of the root pool and reads in its
1614 * MOS at the memory address MOS.
1616 * Return:
1617 * 1 - success
1618 * 0 - failure
1621 zfs_mount(void)
1623 char *stack, *ub_array;
1624 int label = 0;
1625 uberblock_t *ubbest;
1626 objset_phys_t *osp;
1627 char tmp_bootpath[MAXNAMELEN];
1628 char tmp_devid[MAXNAMELEN];
1629 uint64_t tmp_guid, ashift, version;
1630 uint64_t adjpl = (uint64_t)part_length << SPA_MINBLOCKSHIFT;
1631 int err = errnum; /* preserve previous errnum state */
1633 /* if it's our first time here, zero the best uberblock out */
1634 if (best_drive == 0 && best_part == 0 && find_best_root) {
1635 grub_memset(&current_uberblock, 0, sizeof (uberblock_t));
1636 pool_guid = 0;
1639 stackbase = ZFS_SCRATCH;
1640 stack = stackbase;
1641 ub_array = stack;
1642 stack += VDEV_UBERBLOCK_RING;
1644 osp = (objset_phys_t *)stack;
1645 stack += sizeof (objset_phys_t);
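/* round the device size down to a label-size multiple, matching how ZFS locates the two back labels */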
1646 adjpl = P2ALIGN(adjpl, (uint64_t)sizeof (vdev_label_t));
1648 for (label = 0; label < VDEV_LABELS; label++) {
1651 * Some El Torito stacks don't give us a size, so we
1652 * end up setting the size to MAXUINT; further, some
1653 * of these devices stop working once a single read
1654 * past the end has been issued. Checking for a
1655 * maximum part_length and skipping the backup labels
1656 * at the end of the slice/partition/device avoids
1657 * breaking down on such devices.
1659 if (part_length == MAXUINT && label == 2)
1660 break;
1662 uint64_t sector = vdev_label_start(adjpl,
1663 label) >> SPA_MINBLOCKSHIFT;
1665 /* Read in the uberblock ring (128K). */
1666 if (devread(sector +
1667 ((VDEV_SKIP_SIZE + VDEV_PHYS_SIZE) >> SPA_MINBLOCKSHIFT),
1668 0, VDEV_UBERBLOCK_RING, ub_array) == 0)
1669 continue;
1671 if (check_pool_label(sector, stack, tmp_devid,
1672 tmp_bootpath, &tmp_guid, &ashift, &version))
1673 continue;
1675 if (pool_guid == 0)
1676 pool_guid = tmp_guid;
1678 if ((ubbest = find_bestub(ub_array, ashift, sector)) == NULL ||
1679 zio_read(&ubbest->ub_rootbp, osp, stack) != 0)
1680 continue;
1682 VERIFY_OS_TYPE(osp, DMU_OST_META);
1684 if (version >= SPA_VERSION_FEATURES &&
1685 check_mos_features(&osp->os_meta_dnode, stack) != 0)
1686 continue;
1688 if (find_best_root && ((pool_guid != tmp_guid) ||
1689 vdev_uberblock_compare(ubbest, &(current_uberblock)) <= 0))
1690 continue;
1692 /* Got the MOS. Save it at the memory addr MOS. */
1693 grub_memmove(MOS, &osp->os_meta_dnode, DNODE_SIZE);
1694 grub_memmove(&current_uberblock, ubbest, sizeof (uberblock_t));
1695 grub_memmove(current_bootpath, tmp_bootpath, MAXNAMELEN);
1696 grub_memmove(current_devid, tmp_devid, grub_strlen(tmp_devid));
1697 is_zfs_mount = 1;
1698 return (1);
1702 * While some fs implementations (tftp) rely on setting and keeping
1703 * the global errnum set, others won't reset it and will break
1704 * when issuing rawreads. The goal here is to simply not
1705 * have zfs mount attempts impact the previous state.
1707 errnum = err;
1708 return (0);
1712 * zfs_open() locates a file in the rootpool by following the
1713 * MOS and places the dnode of the file at the memory address DNODE.
1715 * Return:
1716 * 1 - success
1717 * 0 - failure
1720 zfs_open(char *filename)
1722 char *stack;
1723 dnode_phys_t *mdn;
1725 file_buf = NULL;
1726 stackbase = ZFS_SCRATCH;
1727 stack = stackbase;
1729 mdn = (dnode_phys_t *)stack;
1730 stack += sizeof (dnode_phys_t);
1732 dnode_mdn = NULL;
1733 dnode_buf = (dnode_phys_t *)stack;
1734 stack += 1<<DNODE_BLOCK_SHIFT;
1737 * menu.lst is placed at the root pool filesystem level,
1738 * do not go to 'current_bootfs'.
1740 if (is_top_dataset_file(filename)) {
1741 if (errnum = get_objset_mdn(MOS, NULL, NULL, mdn, stack))
1742 return (0);
1744 current_bootfs_obj = 0;
1745 } else {
1746 if (current_bootfs[0] == '\0') {
1747 /* Get the default root filesystem object number */
1748 if (errnum = get_default_bootfsobj(MOS,
1749 &current_bootfs_obj, stack))
1750 return (0);
1752 if (errnum = get_objset_mdn(MOS, NULL,
1753 &current_bootfs_obj, mdn, stack))
1754 return (0);
1755 } else {
1756 if (errnum = get_objset_mdn(MOS, current_bootfs,
1757 &current_bootfs_obj, mdn, stack)) {
1758 grub_memset(current_bootfs, 0, MAXNAMELEN);
1759 return (0);
1764 if (dnode_get_path(mdn, filename, DNODE, stack)) {
1765 errnum = ERR_FILE_NOT_FOUND;
1766 return (0);
1769 /* get the file size and set the file position to 0 */
1772 * For DMU_OT_SA we will need to locate the SIZE attribute,
1773 * which could be either in the bonus buffer or in the
1774 * "spill" block.
1776 if (DNODE->dn_bonustype == DMU_OT_SA) {
1777 sa_hdr_phys_t *sahdrp;
1778 int hdrsize;
1780 if (DNODE->dn_bonuslen != 0) {
1781 sahdrp = (sa_hdr_phys_t *)DN_BONUS(DNODE);
1782 } else {
1783 if (DNODE->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
1784 blkptr_t *bp = &DNODE->dn_spill;
1785 void *buf;
1787 buf = (void *)stack;
1788 stack += BP_GET_LSIZE(bp);
1790 /* reset errnum to rawread() failure */
1791 errnum = 0;
1792 if (zio_read(bp, buf, stack) != 0) {
1793 return (0);
1795 sahdrp = buf;
1796 } else {
1797 errnum = ERR_FSYS_CORRUPT;
1798 return (0);
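/* assume the standard SA layout: the file size is SA_SIZE_OFFSET bytes past the variable-length SA header */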
1801 hdrsize = SA_HDR_SIZE(sahdrp);
1802 filemax = *(uint64_t *)((char *)sahdrp + hdrsize +
1803 SA_SIZE_OFFSET);
1804 } else {
1805 filemax = ((znode_phys_t *)DN_BONUS(DNODE))->zp_size;
1807 filepos = 0;
1809 dnode_buf = NULL;
1810 return (1);
1814 * zfs_read() reads in the data blocks pointed to by the DNODE.
1816 * Return:
1817 * len - the length successfully read in to the buffer
1818 * 0 - failure
1821 zfs_read(char *buf, int len)
1823 char *stack;
1824 int blksz, length, movesize;
1826 if (file_buf == NULL) {
1827 file_buf = stackbase;
1828 stackbase += SPA_MAXBLOCKSIZE;
1829 file_start = file_end = 0;
1831 stack = stackbase;
1834 * If the requested range is already in memory, copy it into the buffer provided and return.
1836 if (filepos >= file_start && filepos+len <= file_end) {
1837 grub_memmove(buf, file_buf + filepos - file_start, len);
1838 filepos += len;
1839 return (len);
1842 blksz = DNODE->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1845 * Note: for GRUB, SPA_MAXBLOCKSIZE is 128KB. There is not enough
1846 * memory to allocate the new max blocksize (16MB), so while
1847 * GRUB understands the large_blocks on-disk feature, it can't
1848 * actually read large blocks.
1850 if (blksz > SPA_MAXBLOCKSIZE) {
1851 grub_printf("blocks larger than 128K are not supported\n");
1852 return (0);
1856 * The entire file is too big to fit into the space available. We
1857 * will need to read it in chunks. This could be optimized to
1858 * read in as large a chunk as there is space available, but for
1859 * now, this only reads in one data block at a time.
1861 length = len;
1862 while (length) {
1864 * Find requested blkid and the offset within that block.
1866 uint64_t blkid = filepos / blksz;
1868 if (errnum = dmu_read(DNODE, blkid, file_buf, stack))
1869 return (0);
1871 file_start = blkid * blksz;
1872 file_end = file_start + blksz;
1874 movesize = MIN(length, file_end - filepos);
1876 grub_memmove(buf, file_buf + filepos - file_start,
1877 movesize);
1878 buf += movesize;
1879 length -= movesize;
1880 filepos += movesize;
1883 return (len);
1887 * No-Op
1890 zfs_embed(int *start_sector, int needed_sectors)
1892 return (1);
1895 #endif /* FSYS_ZFS */