2 * GRUB -- GRand Unified Bootloader
3 * Copyright (C) 1999,2000,2001,2002,2003,2004 Free Software Foundation, Inc.
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
22 * Use is subject to license terms.
26 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
27 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
31 * The zfs plug-in routines for GRUB are:
33 * zfs_mount() - locates a valid uberblock of the root pool and reads
34 * in its MOS at the memory address MOS.
36 * zfs_open() - locates a plain file object by following the MOS
37 * and places its dnode at the memory address DNODE.
39 * zfs_read() - read in the data blocks pointed by the DNODE.
41 * ZFS_SCRATCH is used as a working area.
43 * (memory addr) MOS DNODE ZFS_SCRATCH
45 * +-------V---------V----------V---------------+
46 * memory | | dnode | dnode | scratch |
47 * | | 512B | 512B | area |
48 * +--------------------------------------------+
57 /* cache for a file block of the currently zfs_open()-ed file */
58 static void *file_buf
= NULL
;
59 static uint64_t file_start
= 0;
60 static uint64_t file_end
= 0;
62 /* cache for a dnode block */
63 static dnode_phys_t
*dnode_buf
= NULL
;
64 static dnode_phys_t
*dnode_mdn
= NULL
;
65 static uint64_t dnode_start
= 0;
66 static uint64_t dnode_end
= 0;
68 static uint64_t pool_guid
= 0;
69 static uberblock_t current_uberblock
;
70 static char *stackbase
;
72 decomp_entry_t decomp_table
[ZIO_COMPRESS_FUNCTIONS
] =
74 {"inherit", 0}, /* ZIO_COMPRESS_INHERIT */
75 {"on", lzjb_decompress
}, /* ZIO_COMPRESS_ON */
76 {"off", 0}, /* ZIO_COMPRESS_OFF */
77 {"lzjb", lzjb_decompress
}, /* ZIO_COMPRESS_LZJB */
78 {"empty", 0}, /* ZIO_COMPRESS_EMPTY */
79 {"gzip-1", 0}, /* ZIO_COMPRESS_GZIP_1 */
80 {"gzip-2", 0}, /* ZIO_COMPRESS_GZIP_2 */
81 {"gzip-3", 0}, /* ZIO_COMPRESS_GZIP_3 */
82 {"gzip-4", 0}, /* ZIO_COMPRESS_GZIP_4 */
83 {"gzip-5", 0}, /* ZIO_COMPRESS_GZIP_5 */
84 {"gzip-6", 0}, /* ZIO_COMPRESS_GZIP_6 */
85 {"gzip-7", 0}, /* ZIO_COMPRESS_GZIP_7 */
86 {"gzip-8", 0}, /* ZIO_COMPRESS_GZIP_8 */
87 {"gzip-9", 0}, /* ZIO_COMPRESS_GZIP_9 */
88 {"zle", 0}, /* ZIO_COMPRESS_ZLE */
89 {"lz4", lz4_decompress
} /* ZIO_COMPRESS_LZ4 */
92 static int zio_read_data(blkptr_t
*bp
, void *buf
, char *stack
);
/*
 * Our own version of bcmp().
 *
 * Compares the first n bytes of s1 and s2.
 *
 * Returns 0 when the two ranges are byte-for-byte equal (trivially so when
 * s1 == s2 or n == 0) and a non-zero value at the first differing byte.
 * Callers only test the result against 0.
 *
 * NOTE(review): the return type, the byte loop and the return statements
 * were restored from the bcmp() contract and the visible guard/comparison;
 * confirm against a pristine copy of the file.
 */
static int
zfs_bcmp(const void *s1, const void *s2, size_t n)
{
	const unsigned char *ps1 = s1;
	const unsigned char *ps2 = s2;

	/* identical pointers or an empty range compare equal without a scan */
	if (s1 != s2 && n != 0) {
		do {
			if (*ps1++ != *ps2++)
				return (1);
		} while (--n != 0);
	}

	return (0);
}
114 * Our own version of log2(). Same thing as highbit()-1.
117 zfs_log2(uint64_t num
)
129 /* Checksum Functions */
131 zio_checksum_off(const void *buf
, uint64_t size
, zio_cksum_t
*zcp
)
133 ZIO_SET_CHECKSUM(zcp
, 0, 0, 0, 0);
136 /* Checksum Table and Values */
137 zio_checksum_info_t zio_checksum_table
[ZIO_CHECKSUM_FUNCTIONS
] = {
138 {{NULL
, NULL
}, 0, 0, "inherit"},
139 {{NULL
, NULL
}, 0, 0, "on"},
140 {{zio_checksum_off
, zio_checksum_off
}, 0, 0, "off"},
141 {{zio_checksum_SHA256
, zio_checksum_SHA256
}, 1, 1, "label"},
142 {{zio_checksum_SHA256
, zio_checksum_SHA256
}, 1, 1, "gang_header"},
143 {{NULL
, NULL
}, 0, 0, "zilog"},
144 {{fletcher_2_native
, fletcher_2_byteswap
}, 0, 0, "fletcher2"},
145 {{fletcher_4_native
, fletcher_4_byteswap
}, 1, 0, "fletcher4"},
146 {{zio_checksum_SHA256
, zio_checksum_SHA256
}, 1, 0, "SHA256"},
147 {{NULL
, NULL
}, 0, 0, "zilog2"},
148 {{zio_checksum_off
, zio_checksum_off
}, 0, 0, "noparity"},
149 {{zio_checksum_SHA512
, NULL
}, 0, 0, "SHA512"}
153 * zio_checksum_verify: Provides support for checksum verification.
155 * Fletcher2, Fletcher4, SHA-256 and SHA-512/256 are supported.
162 zio_checksum_verify(blkptr_t
*bp
, char *data
, int size
)
164 zio_cksum_t zc
= bp
->blk_cksum
;
165 uint32_t checksum
= BP_GET_CHECKSUM(bp
);
166 int byteswap
= BP_SHOULD_BYTESWAP(bp
);
167 zio_eck_t
*zec
= (zio_eck_t
*)(data
+ size
) - 1;
168 zio_checksum_info_t
*ci
= &zio_checksum_table
[checksum
];
169 zio_cksum_t actual_cksum
, expected_cksum
;
172 grub_printf("byteswap not supported\n");
176 if (checksum
>= ZIO_CHECKSUM_FUNCTIONS
|| ci
->ci_func
[0] == NULL
) {
177 grub_printf("checksum algorithm %u not supported\n", checksum
);
182 expected_cksum
= zec
->zec_cksum
;
184 ci
->ci_func
[0](data
, size
, &actual_cksum
);
185 zec
->zec_cksum
= expected_cksum
;
188 ci
->ci_func
[byteswap
](data
, size
, &actual_cksum
);
191 if ((actual_cksum
.zc_word
[0] - zc
.zc_word
[0]) |
192 (actual_cksum
.zc_word
[1] - zc
.zc_word
[1]) |
193 (actual_cksum
.zc_word
[2] - zc
.zc_word
[2]) |
194 (actual_cksum
.zc_word
[3] - zc
.zc_word
[3]))
201 * vdev_label_start returns the physical disk offset (in bytes) of
205 vdev_label_start(uint64_t psize
, int l
)
207 return (l
* sizeof (vdev_label_t
) + (l
< VDEV_LABELS
/ 2 ?
208 0 : psize
- VDEV_LABELS
* sizeof (vdev_label_t
)));
212 * vdev_uberblock_compare takes two uberblock structures and returns an integer
213 * indicating the more recent of the two.
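214 * Return Value = 1 if ub1 is more recent
215 * Return Value = -1 if ub2 is more recent
215 * NOTE(review): find_bestub() tests compare(ubnext, ubbest) > 0, i.e. it
215 * treats a positive result as "first argument is newer" - confirm against
215 * the return statements.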
216 * The most recent uberblock is determined using its transaction number and
217 * timestamp. The uberblock with the highest transaction number is
218 * considered "newer". If the transaction numbers of the two blocks match, the
219 * timestamps are compared to determine the "newer" of the two.
222 vdev_uberblock_compare(uberblock_t
*ub1
, uberblock_t
*ub2
)
224 if (ub1
->ub_txg
< ub2
->ub_txg
)
226 if (ub1
->ub_txg
> ub2
->ub_txg
)
229 if (ub1
->ub_timestamp
< ub2
->ub_timestamp
)
231 if (ub1
->ub_timestamp
> ub2
->ub_timestamp
)
238 * Three pieces of information are needed to verify an uberblock: the magic
239 * number, the version number, and the checksum.
246 uberblock_verify(uberblock_t
*uber
, uint64_t ub_size
, uint64_t offset
)
251 BP_SET_CHECKSUM(&bp
, ZIO_CHECKSUM_LABEL
);
252 BP_SET_BYTEORDER(&bp
, ZFS_HOST_BYTEORDER
);
253 ZIO_SET_CHECKSUM(&bp
.blk_cksum
, offset
, 0, 0, 0);
255 if (zio_checksum_verify(&bp
, (char *)uber
, ub_size
) != 0)
258 if (uber
->ub_magic
== UBERBLOCK_MAGIC
&&
259 SPA_VERSION_IS_SUPPORTED(uber
->ub_version
))
266 * Find the best uberblock.
268 * Success - Pointer to the best uberblock.
272 find_bestub(char *ub_array
, uint64_t ashift
, uint64_t sector
)
274 uberblock_t
*ubbest
= NULL
;
276 uint64_t offset
, ub_size
;
279 ub_size
= VDEV_UBERBLOCK_SIZE(ashift
);
281 for (i
= 0; i
< VDEV_UBERBLOCK_COUNT(ashift
); i
++) {
282 ubnext
= (uberblock_t
*)ub_array
;
284 offset
= (sector
<< SPA_MINBLOCKSHIFT
) +
285 VDEV_UBERBLOCK_OFFSET(ashift
, i
);
287 if (uberblock_verify(ubnext
, ub_size
, offset
) != 0)
290 if (ubbest
== NULL
||
291 vdev_uberblock_compare(ubnext
, ubbest
) > 0)
299 * Read a block of data based on the gang block address dva,
300 * and put its data in buf.
307 zio_read_gang(blkptr_t
*bp
, dva_t
*dva
, void *buf
, char *stack
)
309 zio_gbh_phys_t
*zio_gb
;
310 uint64_t offset
, sector
;
314 zio_gb
= (zio_gbh_phys_t
*)stack
;
315 stack
+= SPA_GANGBLOCKSIZE
;
316 offset
= DVA_GET_OFFSET(dva
);
317 sector
= DVA_OFFSET_TO_PHYS_SECTOR(offset
);
319 /* read in the gang block header */
320 if (devread(sector
, 0, SPA_GANGBLOCKSIZE
, (char *)zio_gb
) == 0) {
321 grub_printf("failed to read in a gang block header\n");
325 /* self-checksumming the gang block header */
327 BP_SET_CHECKSUM(&tmpbp
, ZIO_CHECKSUM_GANG_HEADER
);
328 BP_SET_BYTEORDER(&tmpbp
, ZFS_HOST_BYTEORDER
);
329 ZIO_SET_CHECKSUM(&tmpbp
.blk_cksum
, DVA_GET_VDEV(dva
),
330 DVA_GET_OFFSET(dva
), bp
->blk_birth
, 0);
331 if (zio_checksum_verify(&tmpbp
, (char *)zio_gb
, SPA_GANGBLOCKSIZE
)) {
332 grub_printf("failed to checksum a gang block header\n");
336 for (i
= 0; i
< SPA_GBH_NBLKPTRS
; i
++) {
337 if (BP_IS_HOLE(&zio_gb
->zg_blkptr
[i
]))
340 if (zio_read_data(&zio_gb
->zg_blkptr
[i
], buf
, stack
))
342 buf
+= BP_GET_PSIZE(&zio_gb
->zg_blkptr
[i
]);
349 * Read in a block of raw data to buf.
356 zio_read_data(blkptr_t
*bp
, void *buf
, char *stack
)
360 psize
= BP_GET_PSIZE(bp
);
362 /* pick a good dva from the block pointer */
363 for (i
= 0; i
< SPA_DVAS_PER_BP
; i
++) {
364 uint64_t offset
, sector
;
366 if (bp
->blk_dva
[i
].dva_word
[0] == 0 &&
367 bp
->blk_dva
[i
].dva_word
[1] == 0)
370 if (DVA_GET_GANG(&bp
->blk_dva
[i
])) {
371 if (zio_read_gang(bp
, &bp
->blk_dva
[i
], buf
, stack
) != 0)
374 /* read in a data block */
375 offset
= DVA_GET_OFFSET(&bp
->blk_dva
[i
]);
376 sector
= DVA_OFFSET_TO_PHYS_SECTOR(offset
);
377 if (devread(sector
, 0, psize
, buf
) == 0)
381 /* verify that the checksum matches */
382 if (zio_checksum_verify(bp
, buf
, psize
) == 0) {
387 grub_printf("could not read block due to EIO or ECKSUM\n");
392 * buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be
393 * more than BPE_PAYLOAD_SIZE bytes).
396 decode_embedded_bp_compressed(const blkptr_t
*bp
, void *buf
)
401 const uint64_t *bp64
= (const uint64_t *)bp
;
403 psize
= BPE_GET_PSIZE(bp
);
406 * Decode the words of the block pointer into the byte array.
407 * Low bits of first word are the first byte (little endian).
409 for (i
= 0; i
< psize
; i
++) {
410 if (i
% sizeof (w
) == 0) {
411 /* beginning of a word */
414 if (!BPE_IS_PAYLOADWORD(bp
, bp64
))
417 buf8
[i
] = BF64_GET(w
, (i
% sizeof (w
)) * NBBY
, NBBY
);
422 * Fill in the buffer with the (decompressed) payload of the embedded
423 * blkptr_t. Takes into account compression and byteorder (the payload is
424 * treated as a stream of bytes).
425 * Return 0 on success, or ENOSPC if it won't fit in the buffer.
428 decode_embedded_bp(const blkptr_t
*bp
, void *buf
)
435 lsize
= BPE_GET_LSIZE(bp
);
436 psize
= BPE_GET_PSIZE(bp
);
437 comp
= BP_GET_COMPRESS(bp
);
439 if (comp
!= ZIO_COMPRESS_OFF
) {
440 uint8_t dstbuf
[BPE_PAYLOAD_SIZE
];
442 if ((unsigned int)comp
>= ZIO_COMPRESS_FUNCTIONS
||
443 decomp_table
[comp
].decomp_func
== NULL
) {
444 grub_printf("compression algorithm not supported\n");
445 return (ERR_FSYS_CORRUPT
);
448 decode_embedded_bp_compressed(bp
, dstbuf
);
449 decomp_table
[comp
].decomp_func(dstbuf
, buf
, psize
, lsize
);
451 decode_embedded_bp_compressed(bp
, buf
);
458 * Read in a block of data, verify its checksum, decompress if needed,
459 * and put the uncompressed data in buf.
466 zio_read(blkptr_t
*bp
, void *buf
, char *stack
)
468 int lsize
, psize
, comp
;
471 if (BP_IS_EMBEDDED(bp
)) {
472 if (BPE_GET_ETYPE(bp
) != BP_EMBEDDED_TYPE_DATA
) {
473 grub_printf("unsupported embedded BP (type=%u)\n",
474 (int)BPE_GET_ETYPE(bp
));
475 return (ERR_FSYS_CORRUPT
);
477 return (decode_embedded_bp(bp
, buf
));
480 comp
= BP_GET_COMPRESS(bp
);
481 lsize
= BP_GET_LSIZE(bp
);
482 psize
= BP_GET_PSIZE(bp
);
484 if ((unsigned int)comp
>= ZIO_COMPRESS_FUNCTIONS
||
485 (comp
!= ZIO_COMPRESS_OFF
&&
486 decomp_table
[comp
].decomp_func
== NULL
)) {
487 grub_printf("compression algorithm not supported\n");
488 return (ERR_FSYS_CORRUPT
);
491 if ((char *)buf
< stack
&& ((char *)buf
) + lsize
> stack
) {
492 grub_printf("not enough memory to fit %u bytes on stack\n",
494 return (ERR_WONT_FIT
);
498 if (comp
!= ZIO_COMPRESS_OFF
) {
503 if (zio_read_data(bp
, buf
, stack
) != 0) {
504 grub_printf("zio_read_data failed\n");
505 return (ERR_FSYS_CORRUPT
);
508 if (comp
!= ZIO_COMPRESS_OFF
) {
509 if (decomp_table
[comp
].decomp_func(buf
, retbuf
, psize
,
511 grub_printf("zio_read decompression failed\n");
512 return (ERR_FSYS_CORRUPT
);
520 * Get the block from a block id.
521 * push the block onto the stack.
528 dmu_read(dnode_phys_t
*dn
, uint64_t blkid
, void *buf
, char *stack
)
531 blkptr_t
*bp_array
= dn
->dn_blkptr
;
532 int epbs
= dn
->dn_indblkshift
- SPA_BLKPTRSHIFT
;
533 blkptr_t
*bp
, *tmpbuf
;
535 bp
= (blkptr_t
*)stack
;
536 stack
+= sizeof (blkptr_t
);
538 tmpbuf
= (blkptr_t
*)stack
;
539 stack
+= 1<<dn
->dn_indblkshift
;
541 for (level
= dn
->dn_nlevels
- 1; level
>= 0; level
--) {
542 idx
= (blkid
>> (epbs
* level
)) & ((1<<epbs
)-1);
546 if (BP_IS_HOLE(bp
)) {
548 dn
->dn_datablkszsec
<< SPA_MINBLOCKSHIFT
);
550 } else if (errnum
= zio_read(bp
, tmpbuf
, stack
)) {
561 * mzap_lookup: Looks up property described by "name" and returns the value
569 mzap_lookup(mzap_phys_t
*zapobj
, int objsize
, const char *name
,
573 mzap_ent_phys_t
*mzap_ent
= zapobj
->mz_chunk
;
575 chunks
= objsize
/ MZAP_ENT_LEN
- 1;
576 for (i
= 0; i
< chunks
; i
++) {
577 if (grub_strcmp(mzap_ent
[i
].mze_name
, name
) == 0) {
578 *value
= mzap_ent
[i
].mze_value
;
583 return (ERR_FSYS_CORRUPT
);
587 zap_hash(uint64_t salt
, const char *name
)
589 static uint64_t table
[256];
594 if (table
[128] == 0) {
597 for (i
= 0; i
< 256; i
++) {
598 for (ct
= table
+ i
, *ct
= i
, j
= 8; j
> 0; j
--)
599 *ct
= (*ct
>> 1) ^ (-(*ct
& 1) &
604 if (crc
== 0 || table
[128] != ZFS_CRC64_POLY
) {
605 errnum
= ERR_FSYS_CORRUPT
;
609 for (cp
= (const uint8_t *)name
; (c
= *cp
) != '\0'; cp
++)
610 crc
= (crc
>> 8) ^ table
[(crc
^ c
) & 0xFF];
613 * Only use 28 bits, since we need 4 bits in the cookie for the
614 * collision differentiator. We MUST use the high bits, since
615 * those are the ones that we first pay attention to when
616 * choosing the bucket.
618 crc
&= ~((1ULL << (64 - 28)) - 1);
624 * Only to be used on 8-bit arrays.
625 * array_len is actual len in bytes (not encoded le_value_length).
626 * buf is null-terminated.
629 zap_leaf_array_equal(zap_leaf_phys_t
*l
, int blksft
, int chunk
,
630 int array_len
, const char *buf
)
634 while (bseen
< array_len
) {
635 struct zap_leaf_array
*la
=
636 &ZAP_LEAF_CHUNK(l
, blksft
, chunk
).l_array
;
637 int toread
= MIN(array_len
- bseen
, ZAP_LEAF_ARRAY_BYTES
);
639 if (chunk
>= ZAP_LEAF_NUMCHUNKS(blksft
))
642 if (zfs_bcmp(la
->la_array
, buf
+ bseen
, toread
) != 0)
647 return (bseen
== array_len
);
651 * Given a zap_leaf_phys_t, walk through the zap leaf chunks to get the
652 * value for the property "name".
659 zap_leaf_lookup(zap_leaf_phys_t
*l
, int blksft
, uint64_t h
,
660 const char *name
, uint64_t *value
)
663 struct zap_leaf_entry
*le
;
665 /* Verify if this is a valid leaf block */
666 if (l
->l_hdr
.lh_block_type
!= ZBT_LEAF
)
667 return (ERR_FSYS_CORRUPT
);
668 if (l
->l_hdr
.lh_magic
!= ZAP_LEAF_MAGIC
)
669 return (ERR_FSYS_CORRUPT
);
671 for (chunk
= l
->l_hash
[LEAF_HASH(blksft
, h
)];
672 chunk
!= CHAIN_END
; chunk
= le
->le_next
) {
674 if (chunk
>= ZAP_LEAF_NUMCHUNKS(blksft
))
675 return (ERR_FSYS_CORRUPT
);
677 le
= ZAP_LEAF_ENTRY(l
, blksft
, chunk
);
679 /* Verify the chunk entry */
680 if (le
->le_type
!= ZAP_CHUNK_ENTRY
)
681 return (ERR_FSYS_CORRUPT
);
683 if (le
->le_hash
!= h
)
686 if (zap_leaf_array_equal(l
, blksft
, le
->le_name_chunk
,
687 le
->le_name_length
, name
)) {
689 struct zap_leaf_array
*la
;
692 if (le
->le_int_size
!= 8 || le
->le_value_length
!= 1)
693 return (ERR_FSYS_CORRUPT
);
695 /* get the uint64_t property value */
696 la
= &ZAP_LEAF_CHUNK(l
, blksft
,
697 le
->le_value_chunk
).l_array
;
700 *value
= (uint64_t)ip
[0] << 56 | (uint64_t)ip
[1] << 48 |
701 (uint64_t)ip
[2] << 40 | (uint64_t)ip
[3] << 32 |
702 (uint64_t)ip
[4] << 24 | (uint64_t)ip
[5] << 16 |
703 (uint64_t)ip
[6] << 8 | (uint64_t)ip
[7];
709 return (ERR_FSYS_CORRUPT
);
720 fzap_lookup(dnode_phys_t
*zap_dnode
, zap_phys_t
*zap
,
721 const char *name
, uint64_t *value
, char *stack
)
724 uint64_t hash
, idx
, blkid
;
725 int blksft
= zfs_log2(zap_dnode
->dn_datablkszsec
<< DNODE_SHIFT
);
727 /* Verify if this is a fat zap header block */
728 if (zap
->zap_magic
!= (uint64_t)ZAP_MAGIC
||
730 return (ERR_FSYS_CORRUPT
);
732 hash
= zap_hash(zap
->zap_salt
, name
);
736 /* get block id from index */
737 if (zap
->zap_ptrtbl
.zt_numblks
!= 0) {
738 /* external pointer tables not supported */
739 return (ERR_FSYS_CORRUPT
);
741 idx
= ZAP_HASH_IDX(hash
, zap
->zap_ptrtbl
.zt_shift
);
742 blkid
= ((uint64_t *)zap
)[idx
+ (1<<(blksft
-3-1))];
744 /* Get the leaf block */
745 l
= (zap_leaf_phys_t
*)stack
;
747 if ((1<<blksft
) < sizeof (zap_leaf_phys_t
))
748 return (ERR_FSYS_CORRUPT
);
749 if (errnum
= dmu_read(zap_dnode
, blkid
, l
, stack
))
752 return (zap_leaf_lookup(l
, blksft
, hash
, name
, value
));
756 * Read in the data of a zap object and find the value for a matching
764 zap_lookup(dnode_phys_t
*zap_dnode
, const char *name
, uint64_t *val
,
771 /* Read in the first block of the zap object data. */
773 size
= zap_dnode
->dn_datablkszsec
<< SPA_MINBLOCKSHIFT
;
776 if ((errnum
= dmu_read(zap_dnode
, 0, zapbuf
, stack
)) != 0)
779 block_type
= *((uint64_t *)zapbuf
);
781 if (block_type
== ZBT_MICRO
) {
782 return (mzap_lookup(zapbuf
, size
, name
, val
));
783 } else if (block_type
== ZBT_HEADER
) {
784 /* this is a fat zap */
785 return (fzap_lookup(zap_dnode
, zapbuf
, name
,
789 return (ERR_FSYS_CORRUPT
);
/*
 * Attribute record handed to a zap_cb_t callback.
 *
 * zap_iterate() fills one of these per microzap entry: za_integer_length
 * is set to 8, za_num_integers to 1, za_first_integer to the entry's
 * mze_value and za_name to the entry's mze_name.
 */
typedef struct zap_attribute {
	int za_integer_length;
	uint64_t za_num_integers;
	uint64_t za_first_integer;
	char *za_name;
} zap_attribute_t;

/*
 * Callback type invoked by zap_iterate() as cb(&za, arg, stack): "arg" is
 * the caller's opaque pointer and "stack" the scratch area passed through.
 */
typedef int (zap_cb_t)(zap_attribute_t *za, void *arg, char *stack);
802 zap_iterate(dnode_phys_t
*zap_dnode
, zap_cb_t
*cb
, void *arg
, char *stack
)
804 uint32_t size
= zap_dnode
->dn_datablkszsec
<< SPA_MINBLOCKSHIFT
;
807 mzap_phys_t
*mzp
= (mzap_phys_t
*)stack
;
810 if ((errnum
= dmu_read(zap_dnode
, 0, mzp
, stack
)) != 0)
814 * Iteration over fatzap objects has not yet been implemented.
815 * If we encounter a pool in which there are more features for
816 * read than can fit inside a microzap (i.e., more than 2048
817 * features for read), we can add support for fatzap iteration.
820 if (mzp
->mz_block_type
!= ZBT_MICRO
) {
821 grub_printf("feature information stored in fatzap, pool "
822 "version not supported\n");
826 za
.za_integer_length
= 8;
827 za
.za_num_integers
= 1;
828 for (i
= 0; i
< size
/ MZAP_ENT_LEN
- 1; i
++) {
829 mzap_ent_phys_t
*mzep
= &mzp
->mz_chunk
[i
];
832 za
.za_first_integer
= mzep
->mze_value
;
833 za
.za_name
= mzep
->mze_name
;
834 err
= cb(&za
, arg
, stack
);
843 * Get the dnode of an object number from the metadnode of an object set.
846 * mdn - metadnode to get the object dnode
847 * objnum - object number for the object dnode
848 * type - if nonzero, object must be of this type
849 * buf - data buffer that holds the returning dnode
850 * stack - scratch area
857 dnode_get(dnode_phys_t
*mdn
, uint64_t objnum
, uint8_t type
, dnode_phys_t
*buf
,
860 uint64_t blkid
, blksz
; /* the block id this object dnode is in */
861 int epbs
; /* shift of number of dnodes in a block */
862 int idx
; /* index within a block */
865 blksz
= mdn
->dn_datablkszsec
<< SPA_MINBLOCKSHIFT
;
866 epbs
= zfs_log2(blksz
) - DNODE_SHIFT
;
867 blkid
= objnum
>> epbs
;
868 idx
= objnum
& ((1<<epbs
)-1);
870 if (dnode_buf
!= NULL
&& dnode_mdn
== mdn
&&
871 objnum
>= dnode_start
&& objnum
< dnode_end
) {
872 grub_memmove(buf
, &dnode_buf
[idx
], DNODE_SIZE
);
873 VERIFY_DN_TYPE(buf
, type
);
877 if (dnode_buf
&& blksz
== 1<<DNODE_BLOCK_SHIFT
) {
880 dnode_start
= blkid
<< epbs
;
881 dnode_end
= (blkid
+ 1) << epbs
;
883 dnbuf
= (dnode_phys_t
*)stack
;
887 if (errnum
= dmu_read(mdn
, blkid
, (char *)dnbuf
, stack
))
890 grub_memmove(buf
, &dnbuf
[idx
], DNODE_SIZE
);
891 VERIFY_DN_TYPE(buf
, type
);
897 * Check if this is a special file that resides at the top
898 * dataset of the pool. Currently this is the GRUB menu,
899 * boot signature and boot signature backup.
900 * str starts with '/'.
903 is_top_dataset_file(char *str
)
907 if ((tptr
= grub_strstr(str
, "menu.lst")) &&
908 (tptr
[8] == '\0' || tptr
[8] == ' ') &&
912 if (grub_strncmp(str
, BOOTSIGN_DIR
"/",
913 grub_strlen(BOOTSIGN_DIR
) + 1) == 0)
916 if (grub_strcmp(str
, BOOTSIGN_BACKUP
) == 0)
923 check_feature(zap_attribute_t
*za
, void *arg
, char *stack
)
925 const char **names
= arg
;
928 if (za
->za_first_integer
== 0)
931 for (i
= 0; names
[i
] != NULL
; i
++) {
932 if (grub_strcmp(za
->za_name
, names
[i
]) == 0) {
936 grub_printf("missing feature for read '%s'\n", za
->za_name
);
937 return (ERR_NEWER_VERSION
);
941 * Get the file dnode for a given file name where mdn is the meta dnode
942 * for this ZFS object set. When found, place the file dnode in dn.
943 * The 'path' argument will be mangled.
950 dnode_get_path(dnode_phys_t
*mdn
, char *path
, dnode_phys_t
*dn
,
953 uint64_t objnum
, version
;
956 if (errnum
= dnode_get(mdn
, MASTER_NODE_OBJ
, DMU_OT_MASTER_NODE
,
960 if (errnum
= zap_lookup(dn
, ZPL_VERSION_STR
, &version
, stack
))
962 if (version
> ZPL_VERSION
)
965 if (errnum
= zap_lookup(dn
, ZFS_ROOT_OBJ
, &objnum
, stack
))
968 if (errnum
= dnode_get(mdn
, objnum
, DMU_OT_DIRECTORY_CONTENTS
,
972 /* skip leading slashes */
976 while (*path
&& !grub_isspace(*path
)) {
978 /* get the next component name */
980 while (*path
&& !grub_isspace(*path
) && *path
!= '/')
983 *path
= 0; /* ensure null termination */
985 if (errnum
= zap_lookup(dn
, cname
, &objnum
, stack
))
988 objnum
= ZFS_DIRENT_OBJ(objnum
);
989 if (errnum
= dnode_get(mdn
, objnum
, 0, dn
, stack
))
997 /* We found the dnode for this file. Verify if it is a plain file. */
998 VERIFY_DN_TYPE(dn
, DMU_OT_PLAIN_FILE_CONTENTS
);
1004 * Get the default 'bootfs' property value from the rootpool.
1011 get_default_bootfsobj(dnode_phys_t
*mosmdn
, uint64_t *obj
, char *stack
)
1013 uint64_t objnum
= 0;
1014 dnode_phys_t
*dn
= (dnode_phys_t
*)stack
;
1015 stack
+= DNODE_SIZE
;
1017 if (errnum
= dnode_get(mosmdn
, DMU_POOL_DIRECTORY_OBJECT
,
1018 DMU_OT_OBJECT_DIRECTORY
, dn
, stack
))
1022 * find the object number for 'pool_props', and get the dnode
1023 * of the 'pool_props'.
1025 if (zap_lookup(dn
, DMU_POOL_PROPS
, &objnum
, stack
))
1026 return (ERR_FILESYSTEM_NOT_FOUND
);
1028 if (errnum
= dnode_get(mosmdn
, objnum
, DMU_OT_POOL_PROPS
, dn
, stack
))
1031 if (zap_lookup(dn
, ZPOOL_PROP_BOOTFS
, &objnum
, stack
))
1032 return (ERR_FILESYSTEM_NOT_FOUND
);
1035 return (ERR_FILESYSTEM_NOT_FOUND
);
/*
 * List of pool features that the grub implementation of ZFS supports for
 * read. Note that features that are only required for write do not need
 * to be listed here since grub opens pools in read-only mode.
 *
 * When this list is updated the version number in usr/src/grub/capability
 * must be incremented to ensure the new grub gets installed.
 *
 * The list must stay NULL-terminated: check_feature() walks it until it
 * reaches a NULL entry.
 */
static const char *spa_feature_names[] = {
	"org.illumos:lz4_compress",
	"com.delphix:hole_birth",
	"com.delphix:extensible_dataset",
	"com.delphix:embedded_data",
	"org.open-zfs:large_blocks",
	"org.illumos:sha512",
	NULL
};
1060 * Checks whether the MOS features that are active are supported by this
1061 * (GRUB's) implementation of ZFS.
1068 check_mos_features(dnode_phys_t
*mosmdn
, char *stack
)
1074 dn
= (dnode_phys_t
*)stack
;
1075 stack
+= DNODE_SIZE
;
1077 if ((errnum
= dnode_get(mosmdn
, DMU_POOL_DIRECTORY_OBJECT
,
1078 DMU_OT_OBJECT_DIRECTORY
, dn
, stack
)) != 0)
1082 * Find the object number for 'features_for_read' and retrieve its
1083 * corresponding dnode. Note that we don't check features_for_write
1084 * because GRUB is not opening the pool for write.
1086 if ((errnum
= zap_lookup(dn
, DMU_POOL_FEATURES_FOR_READ
, &objnum
,
1090 if ((errnum
= dnode_get(mosmdn
, objnum
, DMU_OTN_ZAP_METADATA
,
1094 return (zap_iterate(dn
, check_feature
, spa_feature_names
, stack
));
1098 * Given a MOS metadnode, get the metadnode of a given filesystem name (fsname),
1099 * e.g. pool/rootfs, or a given object number (obj), e.g. the object number
1102 * If no fsname and no obj are given, return the DSL_DIR metadnode.
1103 * If fsname is given, return its metadnode and its matching object number.
1104 * If only obj is given, return the metadnode for this object number.
1111 get_objset_mdn(dnode_phys_t
*mosmdn
, char *fsname
, uint64_t *obj
,
1112 dnode_phys_t
*mdn
, char *stack
)
1114 uint64_t objnum
, headobj
;
1121 if (fsname
== NULL
&& obj
) {
1126 if (errnum
= dnode_get(mosmdn
, DMU_POOL_DIRECTORY_OBJECT
,
1127 DMU_OT_OBJECT_DIRECTORY
, mdn
, stack
))
1130 if (errnum
= zap_lookup(mdn
, DMU_POOL_ROOT_DATASET
, &objnum
,
1134 if (errnum
= dnode_get(mosmdn
, objnum
, 0, mdn
, stack
))
1137 if (fsname
== NULL
) {
1139 ((dsl_dir_phys_t
*)DN_BONUS(mdn
))->dd_head_dataset_obj
;
1143 /* take out the pool name */
1144 while (*fsname
&& !grub_isspace(*fsname
) && *fsname
!= '/')
1147 while (*fsname
&& !grub_isspace(*fsname
)) {
1150 while (*fsname
== '/')
1154 while (*fsname
&& !grub_isspace(*fsname
) && *fsname
!= '/')
1160 while (*snapname
&& !grub_isspace(*snapname
) && *snapname
!=
1163 if (*snapname
== '@') {
1168 ((dsl_dir_phys_t
*)DN_BONUS(mdn
))->dd_child_dir_zapobj
;
1169 if (errnum
= dnode_get(mosmdn
, childobj
,
1170 DMU_OT_DSL_DIR_CHILD_MAP
, mdn
, stack
))
1173 if (zap_lookup(mdn
, cname
, &objnum
, stack
))
1174 return (ERR_FILESYSTEM_NOT_FOUND
);
1176 if (errnum
= dnode_get(mosmdn
, objnum
, 0,
1184 headobj
= ((dsl_dir_phys_t
*)DN_BONUS(mdn
))->dd_head_dataset_obj
;
1189 if (errnum
= dnode_get(mosmdn
, headobj
, 0, mdn
, stack
))
1194 snapobj
= ((dsl_dataset_phys_t
*)DN_BONUS(mdn
))->
1195 ds_snapnames_zapobj
;
1197 if (errnum
= dnode_get(mosmdn
, snapobj
,
1198 DMU_OT_DSL_DS_SNAP_MAP
, mdn
, stack
))
1200 if (zap_lookup(mdn
, snapname
+ 1, &headobj
, stack
))
1201 return (ERR_FILESYSTEM_NOT_FOUND
);
1202 if (errnum
= dnode_get(mosmdn
, headobj
, 0, mdn
, stack
))
1208 bp
= &((dsl_dataset_phys_t
*)DN_BONUS(mdn
))->ds_bp
;
1209 osp
= (objset_phys_t
*)stack
;
1210 stack
+= sizeof (objset_phys_t
);
1211 if (errnum
= zio_read(bp
, osp
, stack
))
1214 grub_memmove((char *)mdn
, (char *)&osp
->os_meta_dnode
, DNODE_SIZE
);
1220 * For a given XDR packed nvlist, verify the first 4 bytes and move on.
1222 * An XDR packed nvlist is encoded as (comments from nvs_xdr_create) :
1224 * encoding method/host endian (4 bytes)
1225 * nvl_version (4 bytes)
1226 * nvl_nvflag (4 bytes)
1228 * encoded size of the nvpair (4 bytes)
1229 * decoded size of the nvpair (4 bytes)
1230 * name string size (4 bytes)
1231 * name string data (sizeof(NV_ALIGN4(string))
1232 * data type (4 bytes)
1233 * # of elements in the nvpair (4 bytes)
1235 * 2 zeros for the last nvpair
1236 * (end of the entire list) (8 bytes)
1243 nvlist_unpack(char *nvlist
, char **out
)
1245 /* Verify if the 1st and 2nd byte in the nvlist are valid. */
1246 if (nvlist
[0] != NV_ENCODE_XDR
|| nvlist
[1] != HOST_ENDIAN
)
1254 nvlist_array(char *nvlist
, int index
)
1258 for (i
= 0; i
< index
; i
++) {
1259 /* skip the header, nvl_version, and nvl_nvflag */
1260 nvlist
= nvlist
+ 4 * 2;
1262 while (encode_size
= BSWAP_32(*(uint32_t *)nvlist
))
1263 nvlist
+= encode_size
; /* goto the next nvpair */
1265 nvlist
= nvlist
+ 4 * 2; /* skip the ending 2 zeros - 8 bytes */
1272 * The nvlist_next_nvpair() function returns a handle to the next nvpair in the
1273 * list following nvpair. If nvpair is NULL, the first pair is returned. If
1274 * nvpair is the last pair in the nvlist, NULL is returned.
1277 nvlist_next_nvpair(char *nvl
, char *nvpair
)
1285 if (nvpair
== NULL
) {
1286 /* skip over nvl_version and nvl_nvflag */
1287 nvpair
= nvl
+ 4 * 2;
1289 /* skip to the next nvpair */
1290 encode_size
= BSWAP_32(*(uint32_t *)nvpair
);
1291 nvpair
+= encode_size
;
1294 /* 8 bytes of 0 marks the end of the list */
1295 if (*(uint64_t *)nvpair
== 0)
1302 * This function returns 0 on success and 1 on failure. On success, a string
1303 * containing the name of nvpair is saved in buf.
1306 nvpair_name(char *nvp
, char *buf
, int buflen
)
1310 /* skip over encode/decode size */
1313 len
= BSWAP_32(*(uint32_t *)nvp
);
1314 if (buflen
< len
+ 1)
1317 grub_memmove(buf
, nvp
+ 4, len
);
1324 * This function retrieves the value of the nvpair in the form of enumerated
1325 * type data_type_t. This is used to determine the appropriate type to pass to
1329 nvpair_type(char *nvp
)
1333 /* skip over encode/decode size */
1336 /* skip over name_len */
1337 name_len
= BSWAP_32(*(uint32_t *)nvp
);
1340 /* skip over name */
1341 nvp
= nvp
+ ((name_len
+ 3) & ~3); /* align */
1343 type
= BSWAP_32(*(uint32_t *)nvp
);
1349 nvpair_value(char *nvp
, void *val
, int valtype
, int *nelmp
)
1351 int name_len
, type
, slen
;
1353 uint64_t *intval
= val
;
1355 /* skip over encode/decode size */
1358 /* skip over name_len */
1359 name_len
= BSWAP_32(*(uint32_t *)nvp
);
1362 /* skip over name */
1363 nvp
= nvp
+ ((name_len
+ 3) & ~3); /* align */
1365 /* skip over type */
1366 type
= BSWAP_32(*(uint32_t *)nvp
);
1369 if (type
== valtype
) {
1372 nelm
= BSWAP_32(*(uint32_t *)nvp
);
1373 if (valtype
!= DATA_TYPE_BOOLEAN
&& nelm
< 1)
1378 case DATA_TYPE_BOOLEAN
:
1381 case DATA_TYPE_STRING
:
1382 slen
= BSWAP_32(*(uint32_t *)nvp
);
1384 grub_memmove(strval
, nvp
, slen
);
1385 strval
[slen
] = '\0';
1388 case DATA_TYPE_UINT64
:
1389 *intval
= BSWAP_64(*(uint64_t *)nvp
);
1392 case DATA_TYPE_NVLIST
:
1393 *(void **)val
= (void *)nvp
;
1396 case DATA_TYPE_NVLIST_ARRAY
:
1397 *(void **)val
= (void *)nvp
;
1408 nvlist_lookup_value(char *nvlist
, char *name
, void *val
, int valtype
,
1413 for (nvpair
= nvlist_next_nvpair(nvlist
, NULL
);
1415 nvpair
= nvlist_next_nvpair(nvlist
, nvpair
)) {
1416 int name_len
= BSWAP_32(*(uint32_t *)(nvpair
+ 4 * 2));
1417 char *nvp_name
= nvpair
+ 4 * 3;
1419 if ((grub_strncmp(nvp_name
, name
, name_len
) == 0) &&
1420 nvpair_type(nvpair
) == valtype
) {
1421 return (nvpair_value(nvpair
, val
, valtype
, nelmp
));
1428 * Check if this vdev is online and is in a good state.
1431 vdev_validate(char *nv
)
1435 if (nvlist_lookup_value(nv
, ZPOOL_CONFIG_OFFLINE
, &ival
,
1436 DATA_TYPE_UINT64
, NULL
) == 0 ||
1437 nvlist_lookup_value(nv
, ZPOOL_CONFIG_FAULTED
, &ival
,
1438 DATA_TYPE_UINT64
, NULL
) == 0 ||
1439 nvlist_lookup_value(nv
, ZPOOL_CONFIG_REMOVED
, &ival
,
1440 DATA_TYPE_UINT64
, NULL
) == 0)
1441 return (ERR_DEV_VALUES
);
1447 * Get a valid vdev pathname/devid from the boot device.
1448 * The caller should already allocate MAXPATHLEN memory for bootpath and devid.
1451 vdev_get_bootpath(char *nv
, uint64_t inguid
, char *devid
, char *bootpath
,
1456 if (nvlist_lookup_value(nv
, ZPOOL_CONFIG_TYPE
, &type
, DATA_TYPE_STRING
,
1458 return (ERR_FSYS_CORRUPT
);
1460 if (grub_strcmp(type
, VDEV_TYPE_DISK
) == 0) {
1463 if (vdev_validate(nv
) != 0)
1464 return (ERR_NO_BOOTPATH
);
1466 if (nvlist_lookup_value(nv
, ZPOOL_CONFIG_GUID
,
1467 &guid
, DATA_TYPE_UINT64
, NULL
) != 0)
1468 return (ERR_NO_BOOTPATH
);
1471 return (ERR_NO_BOOTPATH
);
1473 /* for a spare vdev, pick the disk labeled with "is_spare" */
1476 (void) nvlist_lookup_value(nv
, ZPOOL_CONFIG_IS_SPARE
,
1477 &spare
, DATA_TYPE_UINT64
, NULL
);
1479 return (ERR_NO_BOOTPATH
);
1482 if (nvlist_lookup_value(nv
, ZPOOL_CONFIG_PHYS_PATH
,
1483 bootpath
, DATA_TYPE_STRING
, NULL
) != 0)
1486 if (nvlist_lookup_value(nv
, ZPOOL_CONFIG_DEVID
,
1487 devid
, DATA_TYPE_STRING
, NULL
) != 0)
1490 if (grub_strlen(bootpath
) >= MAXPATHLEN
||
1491 grub_strlen(devid
) >= MAXPATHLEN
)
1492 return (ERR_WONT_FIT
);
1496 } else if (grub_strcmp(type
, VDEV_TYPE_MIRROR
) == 0 ||
1497 grub_strcmp(type
, VDEV_TYPE_REPLACING
) == 0 ||
1498 (is_spare
= (grub_strcmp(type
, VDEV_TYPE_SPARE
) == 0))) {
1502 if (nvlist_lookup_value(nv
, ZPOOL_CONFIG_CHILDREN
, &child
,
1503 DATA_TYPE_NVLIST_ARRAY
, &nelm
))
1504 return (ERR_FSYS_CORRUPT
);
1506 for (i
= 0; i
< nelm
; i
++) {
1509 child_i
= nvlist_array(child
, i
);
1510 if (vdev_get_bootpath(child_i
, inguid
, devid
,
1511 bootpath
, is_spare
) == 0)
1516 return (ERR_NO_BOOTPATH
);
1520 * Check the disk label information and retrieve needed vdev name-value pairs.
1527 check_pool_label(uint64_t sector
, char *stack
, char *outdevid
,
1528 char *outpath
, uint64_t *outguid
, uint64_t *outashift
, uint64_t *outversion
)
1531 uint64_t pool_state
, txg
= 0;
1532 char *nvlist
, *nv
, *features
;
1535 sector
+= (VDEV_SKIP_SIZE
>> SPA_MINBLOCKSHIFT
);
1537 /* Read in the vdev name-value pair list (112K). */
1538 if (devread(sector
, 0, VDEV_PHYS_SIZE
, stack
) == 0)
1541 vdev
= (vdev_phys_t
*)stack
;
1542 stack
+= sizeof (vdev_phys_t
);
1544 if (nvlist_unpack(vdev
->vp_nvlist
, &nvlist
))
1545 return (ERR_FSYS_CORRUPT
);
1547 if (nvlist_lookup_value(nvlist
, ZPOOL_CONFIG_POOL_STATE
, &pool_state
,
1548 DATA_TYPE_UINT64
, NULL
))
1549 return (ERR_FSYS_CORRUPT
);
1551 if (pool_state
== POOL_STATE_DESTROYED
)
1552 return (ERR_FILESYSTEM_NOT_FOUND
);
1554 if (nvlist_lookup_value(nvlist
, ZPOOL_CONFIG_POOL_NAME
,
1555 current_rootpool
, DATA_TYPE_STRING
, NULL
))
1556 return (ERR_FSYS_CORRUPT
);
1558 if (nvlist_lookup_value(nvlist
, ZPOOL_CONFIG_POOL_TXG
, &txg
,
1559 DATA_TYPE_UINT64
, NULL
))
1560 return (ERR_FSYS_CORRUPT
);
1562 /* not an active device */
1564 return (ERR_NO_BOOTPATH
);
1566 if (nvlist_lookup_value(nvlist
, ZPOOL_CONFIG_VERSION
, outversion
,
1567 DATA_TYPE_UINT64
, NULL
))
1568 return (ERR_FSYS_CORRUPT
);
1569 if (!SPA_VERSION_IS_SUPPORTED(*outversion
))
1570 return (ERR_NEWER_VERSION
);
1571 if (nvlist_lookup_value(nvlist
, ZPOOL_CONFIG_VDEV_TREE
, &nv
,
1572 DATA_TYPE_NVLIST
, NULL
))
1573 return (ERR_FSYS_CORRUPT
);
1574 if (nvlist_lookup_value(nvlist
, ZPOOL_CONFIG_GUID
, &diskguid
,
1575 DATA_TYPE_UINT64
, NULL
))
1576 return (ERR_FSYS_CORRUPT
);
1577 if (nvlist_lookup_value(nv
, ZPOOL_CONFIG_ASHIFT
, outashift
,
1578 DATA_TYPE_UINT64
, NULL
) != 0)
1579 return (ERR_FSYS_CORRUPT
);
1580 if (vdev_get_bootpath(nv
, diskguid
, outdevid
, outpath
, 0))
1581 return (ERR_NO_BOOTPATH
);
1582 if (nvlist_lookup_value(nvlist
, ZPOOL_CONFIG_POOL_GUID
, outguid
,
1583 DATA_TYPE_UINT64
, NULL
))
1584 return (ERR_FSYS_CORRUPT
);
1586 if (nvlist_lookup_value(nvlist
, ZPOOL_CONFIG_FEATURES_FOR_READ
,
1587 &features
, DATA_TYPE_NVLIST
, NULL
) == 0) {
1590 stack
+= MAXNAMELEN
;
1592 for (nvp
= nvlist_next_nvpair(features
, NULL
);
1594 nvp
= nvlist_next_nvpair(features
, nvp
)) {
1597 if (nvpair_name(nvp
, name
, MAXNAMELEN
) != 0)
1598 return (ERR_FSYS_CORRUPT
);
1600 za
.za_integer_length
= 8;
1601 za
.za_num_integers
= 1;
1602 za
.za_first_integer
= 1;
1604 if (check_feature(&za
, spa_feature_names
, stack
) != 0)
1605 return (ERR_NEWER_VERSION
);
1613 * zfs_mount() locates a valid uberblock of the root pool and reads in its MOS
1614 * to the memory address MOS.
1623 char *stack
, *ub_array
;
1625 uberblock_t
*ubbest
;
1627 char tmp_bootpath
[MAXNAMELEN
];
1628 char tmp_devid
[MAXNAMELEN
];
1629 uint64_t tmp_guid
, ashift
, version
;
1630 uint64_t adjpl
= (uint64_t)part_length
<< SPA_MINBLOCKSHIFT
;
1631 int err
= errnum
; /* preserve previous errnum state */
1633 /* if it's our first time here, zero the best uberblock out */
1634 if (best_drive
== 0 && best_part
== 0 && find_best_root
) {
1635 grub_memset(&current_uberblock
, 0, sizeof (uberblock_t
));
1639 stackbase
= ZFS_SCRATCH
;
1642 stack
+= VDEV_UBERBLOCK_RING
;
1644 osp
= (objset_phys_t
*)stack
;
1645 stack
+= sizeof (objset_phys_t
);
1646 adjpl
= P2ALIGN(adjpl
, (uint64_t)sizeof (vdev_label_t
));
1648 for (label
= 0; label
< VDEV_LABELS
; label
++) {
1651 * Some eltorito stacks don't give us a size, so we
1652 * end up setting the size to MAXUINT. Furthermore,
1653 * some of these devices stop working once a single
1654 * read past the end has been issued. Checking
1655 * for a maximum part_length and skipping the backup
1656 * labels at the end of the slice/partition/device
1657 * avoids breaking down on such devices.
1659 if (part_length
== MAXUINT
&& label
== 2)
1662 uint64_t sector
= vdev_label_start(adjpl
,
1663 label
) >> SPA_MINBLOCKSHIFT
;
1665 /* Read in the uberblock ring (128K). */
1666 if (devread(sector
+
1667 ((VDEV_SKIP_SIZE
+ VDEV_PHYS_SIZE
) >> SPA_MINBLOCKSHIFT
),
1668 0, VDEV_UBERBLOCK_RING
, ub_array
) == 0)
1671 if (check_pool_label(sector
, stack
, tmp_devid
,
1672 tmp_bootpath
, &tmp_guid
, &ashift
, &version
))
1676 pool_guid
= tmp_guid
;
1678 if ((ubbest
= find_bestub(ub_array
, ashift
, sector
)) == NULL
||
1679 zio_read(&ubbest
->ub_rootbp
, osp
, stack
) != 0)
1682 VERIFY_OS_TYPE(osp
, DMU_OST_META
);
1684 if (version
>= SPA_VERSION_FEATURES
&&
1685 check_mos_features(&osp
->os_meta_dnode
, stack
) != 0)
1688 if (find_best_root
&& ((pool_guid
!= tmp_guid
) ||
1689 vdev_uberblock_compare(ubbest
, &(current_uberblock
)) <= 0))
1692 /* Got the MOS. Save it at the memory addr MOS. */
1693 grub_memmove(MOS
, &osp
->os_meta_dnode
, DNODE_SIZE
);
1694 grub_memmove(&current_uberblock
, ubbest
, sizeof (uberblock_t
));
1695 grub_memmove(current_bootpath
, tmp_bootpath
, MAXNAMELEN
);
1696 grub_memmove(current_devid
, tmp_devid
, grub_strlen(tmp_devid
));
1702 * While some fs implementations (tftp) rely on the global
1703 * errnum staying set, others never reset it and will break
1704 * when issuing rawreads. The goal here is simply to keep
1705 * zfs mount attempts from affecting the previous errnum state.
1712 * zfs_open() locates a file in the rootpool by following the
1713 * MOS and places the dnode of the file at the memory address DNODE.
1720 zfs_open(char *filename
)
1726 stackbase
= ZFS_SCRATCH
;
1729 mdn
= (dnode_phys_t
*)stack
;
1730 stack
+= sizeof (dnode_phys_t
);
1733 dnode_buf
= (dnode_phys_t
*)stack
;
1734 stack
+= 1<<DNODE_BLOCK_SHIFT
;
1737 * menu.lst is placed at the root pool filesystem level,
1738 * do not goto 'current_bootfs'.
1740 if (is_top_dataset_file(filename
)) {
1741 if (errnum
= get_objset_mdn(MOS
, NULL
, NULL
, mdn
, stack
))
1744 current_bootfs_obj
= 0;
1746 if (current_bootfs
[0] == '\0') {
1747 /* Get the default root filesystem object number */
1748 if (errnum
= get_default_bootfsobj(MOS
,
1749 &current_bootfs_obj
, stack
))
1752 if (errnum
= get_objset_mdn(MOS
, NULL
,
1753 &current_bootfs_obj
, mdn
, stack
))
1756 if (errnum
= get_objset_mdn(MOS
, current_bootfs
,
1757 &current_bootfs_obj
, mdn
, stack
)) {
1758 grub_memset(current_bootfs
, 0, MAXNAMELEN
);
1764 if (dnode_get_path(mdn
, filename
, DNODE
, stack
)) {
1765 errnum
= ERR_FILE_NOT_FOUND
;
1769 /* get the file size and set the file position to 0 */
1772 * For DMU_OT_SA we will need to locate the SIZE attribute,
1773 * which could be either in the bonus buffer
1774 * or the "spill" block.
1776 if (DNODE
->dn_bonustype
== DMU_OT_SA
) {
1777 sa_hdr_phys_t
*sahdrp
;
1780 if (DNODE
->dn_bonuslen
!= 0) {
1781 sahdrp
= (sa_hdr_phys_t
*)DN_BONUS(DNODE
);
1783 if (DNODE
->dn_flags
& DNODE_FLAG_SPILL_BLKPTR
) {
1784 blkptr_t
*bp
= &DNODE
->dn_spill
;
1787 buf
= (void *)stack
;
1788 stack
+= BP_GET_LSIZE(bp
);
1790 /* reset errnum to rawread() failure */
1792 if (zio_read(bp
, buf
, stack
) != 0) {
1797 errnum
= ERR_FSYS_CORRUPT
;
1801 hdrsize
= SA_HDR_SIZE(sahdrp
);
1802 filemax
= *(uint64_t *)((char *)sahdrp
+ hdrsize
+
1805 filemax
= ((znode_phys_t
*)DN_BONUS(DNODE
))->zp_size
;
1814 * zfs_read reads in the data blocks pointed by the DNODE.
1817 * len - the length successfully read into the buffer
1821 zfs_read(char *buf
, int len
)
1824 int blksz
, length
, movesize
;
1826 if (file_buf
== NULL
) {
1827 file_buf
= stackbase
;
1828 stackbase
+= SPA_MAXBLOCKSIZE
;
1829 file_start
= file_end
= 0;
1834 * If the requested range is already cached in memory, copy it into the buffer provided and return.
1836 if (filepos
>= file_start
&& filepos
+len
<= file_end
) {
1837 grub_memmove(buf
, file_buf
+ filepos
- file_start
, len
);
1842 blksz
= DNODE
->dn_datablkszsec
<< SPA_MINBLOCKSHIFT
;
1845 * Note: for GRUB, SPA_MAXBLOCKSIZE is 128KB. There is not enough
1846 * memory to allocate the new max blocksize (16MB), so while
1847 * GRUB understands the large_blocks on-disk feature, it can't
1848 * actually read large blocks.
1850 if (blksz
> SPA_MAXBLOCKSIZE
) {
1851 grub_printf("blocks larger than 128K are not supported\n");
1856 * Entire Dnode is too big to fit into the space available. We
1857 * will need to read it in chunks. This could be optimized to
1858 * read in as large a chunk as there is space available, but for
1859 * now, this only reads in one data block at a time.
1864 * Find requested blkid and the offset within that block.
1866 uint64_t blkid
= filepos
/ blksz
;
1868 if (errnum
= dmu_read(DNODE
, blkid
, file_buf
, stack
))
1871 file_start
= blkid
* blksz
;
1872 file_end
= file_start
+ blksz
;
1874 movesize
= MIN(length
, file_end
- filepos
);
1876 grub_memmove(buf
, file_buf
+ filepos
- file_start
,
1880 filepos
+= movesize
;
1890 zfs_embed(int *start_sector
, int needed_sectors
)
1895 #endif /* FSYS_ZFS */