2 * Copyright (c) 2007 Doug Rabson
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 #include <sys/cdefs.h>
30 * Stand-alone ZFS file reader.
34 #include <sys/stdint.h>
47 * List of all vdevs, chained through v_alllink.
49 static vdev_list_t zfs_vdevs
;
52 * List of ZFS features supported for read
54 static const char *features_for_read
[] = {
55 "org.illumos:lz4_compress",
56 "com.delphix:hole_birth",
57 "com.delphix:extensible_dataset",
58 "com.delphix:embedded_data",
59 "org.open-zfs:large_blocks",
65 * List of all pools, chained through spa_link.
67 static spa_list_t zfs_pools
;
69 static uint64_t zfs_crc64_table
[256];
70 static const dnode_phys_t
*dnode_cache_obj
= 0;
71 static uint64_t dnode_cache_bn
;
72 static char *dnode_cache_buf
;
73 static char *zap_scratch
;
74 static char *zfs_temp_buf
, *zfs_temp_end
, *zfs_temp_ptr
;
76 #define TEMP_SIZE (1024 * 1024)
78 static int zio_read(const spa_t
*spa
, const blkptr_t
*bp
, void *buf
);
79 static int zfs_get_root(const spa_t
*spa
, uint64_t *objid
);
80 static int zfs_rlookup(const spa_t
*spa
, uint64_t objnum
, char *result
);
85 STAILQ_INIT(&zfs_vdevs
);
86 STAILQ_INIT(&zfs_pools
);
88 zfs_temp_buf
= malloc(TEMP_SIZE
);
89 zfs_temp_end
= zfs_temp_buf
+ TEMP_SIZE
;
90 zfs_temp_ptr
= zfs_temp_buf
;
91 dnode_cache_buf
= malloc(SPA_MAXBLOCKSIZE
);
92 zap_scratch
= malloc(SPA_MAXBLOCKSIZE
);
98 zfs_alloc(size_t size
)
102 if (zfs_temp_ptr
+ size
> zfs_temp_end
) {
103 printf("ZFS: out of temporary buffer space\n");
107 zfs_temp_ptr
+= size
;
113 zfs_free(void *ptr
, size_t size
)
116 zfs_temp_ptr
-= size
;
117 if (zfs_temp_ptr
!= ptr
) {
118 printf("ZFS: zfs_alloc()/zfs_free() mismatch\n");
124 xdr_int(const unsigned char **xdr
, int *ip
)
126 *ip
= ((*xdr
)[0] << 24)
135 xdr_u_int(const unsigned char **xdr
, u_int
*ip
)
137 *ip
= ((*xdr
)[0] << 24)
146 xdr_uint64_t(const unsigned char **xdr
, uint64_t *lp
)
152 *lp
= (((uint64_t) hi
) << 32) | lo
;
157 nvlist_find(const unsigned char *nvlist
, const char *name
, int type
,
158 int* elementsp
, void *valuep
)
160 const unsigned char *p
, *pair
;
162 int encoded_size
, decoded_size
;
169 xdr_int(&p
, &encoded_size
);
170 xdr_int(&p
, &decoded_size
);
171 while (encoded_size
&& decoded_size
) {
172 int namelen
, pairtype
, elements
;
173 const char *pairname
;
175 xdr_int(&p
, &namelen
);
176 pairname
= (const char*) p
;
177 p
+= roundup(namelen
, 4);
178 xdr_int(&p
, &pairtype
);
180 if (!memcmp(name
, pairname
, namelen
) && type
== pairtype
) {
181 xdr_int(&p
, &elements
);
183 *elementsp
= elements
;
184 if (type
== DATA_TYPE_UINT64
) {
185 xdr_uint64_t(&p
, (uint64_t *) valuep
);
187 } else if (type
== DATA_TYPE_STRING
) {
190 (*(const char**) valuep
) = (const char*) p
;
192 } else if (type
== DATA_TYPE_NVLIST
193 || type
== DATA_TYPE_NVLIST_ARRAY
) {
194 (*(const unsigned char**) valuep
) =
195 (const unsigned char*) p
;
202 * Not the pair we are looking for, skip to the next one.
204 p
= pair
+ encoded_size
;
208 xdr_int(&p
, &encoded_size
);
209 xdr_int(&p
, &decoded_size
);
216 nvlist_check_features_for_read(const unsigned char *nvlist
)
218 const unsigned char *p
, *pair
;
220 int encoded_size
, decoded_size
;
230 xdr_int(&p
, &encoded_size
);
231 xdr_int(&p
, &decoded_size
);
232 while (encoded_size
&& decoded_size
) {
233 int namelen
, pairtype
;
234 const char *pairname
;
239 xdr_int(&p
, &namelen
);
240 pairname
= (const char*) p
;
241 p
+= roundup(namelen
, 4);
242 xdr_int(&p
, &pairtype
);
244 for (i
= 0; features_for_read
[i
] != NULL
; i
++) {
245 if (!memcmp(pairname
, features_for_read
[i
], namelen
)) {
252 printf("ZFS: unsupported feature: %s\n", pairname
);
256 p
= pair
+ encoded_size
;
259 xdr_int(&p
, &encoded_size
);
260 xdr_int(&p
, &decoded_size
);
267 * Return the next nvlist in an nvlist array.
269 static const unsigned char *
270 nvlist_next(const unsigned char *nvlist
)
272 const unsigned char *p
, *pair
;
274 int encoded_size
, decoded_size
;
281 xdr_int(&p
, &encoded_size
);
282 xdr_int(&p
, &decoded_size
);
283 while (encoded_size
&& decoded_size
) {
284 p
= pair
+ encoded_size
;
287 xdr_int(&p
, &encoded_size
);
288 xdr_int(&p
, &decoded_size
);
296 static const unsigned char *
297 nvlist_print(const unsigned char *nvlist
, unsigned int indent
)
299 static const char* typenames
[] = {
310 "DATA_TYPE_BYTE_ARRAY",
311 "DATA_TYPE_INT16_ARRAY",
312 "DATA_TYPE_UINT16_ARRAY",
313 "DATA_TYPE_INT32_ARRAY",
314 "DATA_TYPE_UINT32_ARRAY",
315 "DATA_TYPE_INT64_ARRAY",
316 "DATA_TYPE_UINT64_ARRAY",
317 "DATA_TYPE_STRING_ARRAY",
320 "DATA_TYPE_NVLIST_ARRAY",
321 "DATA_TYPE_BOOLEAN_VALUE",
324 "DATA_TYPE_BOOLEAN_ARRAY",
325 "DATA_TYPE_INT8_ARRAY",
326 "DATA_TYPE_UINT8_ARRAY"
330 const unsigned char *p
, *pair
;
332 int encoded_size
, decoded_size
;
339 xdr_int(&p
, &encoded_size
);
340 xdr_int(&p
, &decoded_size
);
341 while (encoded_size
&& decoded_size
) {
342 int namelen
, pairtype
, elements
;
343 const char *pairname
;
345 xdr_int(&p
, &namelen
);
346 pairname
= (const char*) p
;
347 p
+= roundup(namelen
, 4);
348 xdr_int(&p
, &pairtype
);
350 for (i
= 0; i
< indent
; i
++)
352 printf("%s %s", typenames
[pairtype
], pairname
);
354 xdr_int(&p
, &elements
);
356 case DATA_TYPE_UINT64
: {
358 xdr_uint64_t(&p
, &val
);
359 printf(" = 0x%jx\n", (uintmax_t)val
);
363 case DATA_TYPE_STRING
: {
366 printf(" = \"%s\"\n", p
);
370 case DATA_TYPE_NVLIST
:
372 nvlist_print(p
, indent
+ 1);
375 case DATA_TYPE_NVLIST_ARRAY
:
376 for (j
= 0; j
< elements
; j
++) {
378 p
= nvlist_print(p
, indent
+ 1);
379 if (j
!= elements
- 1) {
380 for (i
= 0; i
< indent
; i
++)
382 printf("%s %s", typenames
[pairtype
], pairname
);
391 p
= pair
+ encoded_size
;
394 xdr_int(&p
, &encoded_size
);
395 xdr_int(&p
, &decoded_size
);
404 vdev_read_phys(vdev_t
*vdev
, const blkptr_t
*bp
, void *buf
,
405 off_t offset
, size_t size
)
410 if (!vdev
->v_phys_read
)
414 psize
= BP_GET_PSIZE(bp
);
419 /*printf("ZFS: reading %d bytes at 0x%jx to %p\n", psize, (uintmax_t)offset, buf);*/
420 rc
= vdev
->v_phys_read(vdev
, vdev
->v_read_priv
, offset
, buf
, psize
);
423 if (bp
&& zio_checksum_verify(bp
, buf
))
430 vdev_disk_read(vdev_t
*vdev
, const blkptr_t
*bp
, void *buf
,
431 off_t offset
, size_t bytes
)
434 return (vdev_read_phys(vdev
, bp
, buf
,
435 offset
+ VDEV_LABEL_START_SIZE
, bytes
));
440 vdev_mirror_read(vdev_t
*vdev
, const blkptr_t
*bp
, void *buf
,
441 off_t offset
, size_t bytes
)
447 STAILQ_FOREACH(kid
, &vdev
->v_children
, v_childlink
) {
448 if (kid
->v_state
!= VDEV_STATE_HEALTHY
)
450 rc
= kid
->v_read(kid
, bp
, buf
, offset
, bytes
);
459 vdev_replacing_read(vdev_t
*vdev
, const blkptr_t
*bp
, void *buf
,
460 off_t offset
, size_t bytes
)
465 * Here we should have two kids:
466 * First one which is the one we are replacing and we can trust
467 * only this one to have valid data, but it might not be present.
468 * Second one is that one we are replacing with. It is most likely
469 * healthy, but we can't trust it has needed data, so we won't use it.
471 kid
= STAILQ_FIRST(&vdev
->v_children
);
474 if (kid
->v_state
!= VDEV_STATE_HEALTHY
)
476 return (kid
->v_read(kid
, bp
, buf
, offset
, bytes
));
480 vdev_find(uint64_t guid
)
484 STAILQ_FOREACH(vdev
, &zfs_vdevs
, v_alllink
)
485 if (vdev
->v_guid
== guid
)
492 vdev_create(uint64_t guid
, vdev_read_t
*vdev_read
)
496 vdev
= malloc(sizeof(vdev_t
));
497 memset(vdev
, 0, sizeof(vdev_t
));
498 STAILQ_INIT(&vdev
->v_children
);
500 vdev
->v_state
= VDEV_STATE_OFFLINE
;
501 vdev
->v_read
= vdev_read
;
502 vdev
->v_phys_read
= 0;
503 vdev
->v_read_priv
= 0;
504 STAILQ_INSERT_TAIL(&zfs_vdevs
, vdev
, v_alllink
);
510 vdev_init_from_nvlist(const unsigned char *nvlist
, vdev_t
*pvdev
,
511 vdev_t
**vdevp
, int is_newer
)
514 uint64_t guid
, id
, ashift
, nparity
;
518 const unsigned char *kids
;
519 int nkids
, i
, is_new
;
520 uint64_t is_offline
, is_faulted
, is_degraded
, is_removed
, isnt_present
;
522 if (nvlist_find(nvlist
, ZPOOL_CONFIG_GUID
,
523 DATA_TYPE_UINT64
, 0, &guid
)
524 || nvlist_find(nvlist
, ZPOOL_CONFIG_ID
,
525 DATA_TYPE_UINT64
, 0, &id
)
526 || nvlist_find(nvlist
, ZPOOL_CONFIG_TYPE
,
527 DATA_TYPE_STRING
, 0, &type
)) {
528 printf("ZFS: can't find vdev details\n");
532 if (strcmp(type
, VDEV_TYPE_MIRROR
)
533 && strcmp(type
, VDEV_TYPE_DISK
)
535 && strcmp(type
, VDEV_TYPE_FILE
)
537 && strcmp(type
, VDEV_TYPE_RAIDZ
)
538 && strcmp(type
, VDEV_TYPE_REPLACING
)) {
539 printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
543 is_offline
= is_removed
= is_faulted
= is_degraded
= isnt_present
= 0;
545 nvlist_find(nvlist
, ZPOOL_CONFIG_OFFLINE
, DATA_TYPE_UINT64
, 0,
547 nvlist_find(nvlist
, ZPOOL_CONFIG_REMOVED
, DATA_TYPE_UINT64
, 0,
549 nvlist_find(nvlist
, ZPOOL_CONFIG_FAULTED
, DATA_TYPE_UINT64
, 0,
551 nvlist_find(nvlist
, ZPOOL_CONFIG_DEGRADED
, DATA_TYPE_UINT64
, 0,
553 nvlist_find(nvlist
, ZPOOL_CONFIG_NOT_PRESENT
, DATA_TYPE_UINT64
, 0,
556 vdev
= vdev_find(guid
);
560 if (!strcmp(type
, VDEV_TYPE_MIRROR
))
561 vdev
= vdev_create(guid
, vdev_mirror_read
);
562 else if (!strcmp(type
, VDEV_TYPE_RAIDZ
))
563 vdev
= vdev_create(guid
, vdev_raidz_read
);
564 else if (!strcmp(type
, VDEV_TYPE_REPLACING
))
565 vdev
= vdev_create(guid
, vdev_replacing_read
);
567 vdev
= vdev_create(guid
, vdev_disk_read
);
570 vdev
->v_top
= pvdev
!= NULL
? pvdev
: vdev
;
571 if (nvlist_find(nvlist
, ZPOOL_CONFIG_ASHIFT
,
572 DATA_TYPE_UINT64
, 0, &ashift
) == 0)
573 vdev
->v_ashift
= ashift
;
576 if (nvlist_find(nvlist
, ZPOOL_CONFIG_NPARITY
,
577 DATA_TYPE_UINT64
, 0, &nparity
) == 0)
578 vdev
->v_nparity
= nparity
;
581 if (nvlist_find(nvlist
, ZPOOL_CONFIG_PATH
,
582 DATA_TYPE_STRING
, 0, &path
) == 0) {
583 if (strncmp(path
, "/dev/dsk/", 9) == 0)
585 vdev
->v_name
= strdup(path
);
586 if (nvlist_find(nvlist
, ZPOOL_CONFIG_PHYS_PATH
,
587 DATA_TYPE_STRING
, 0, &path
) == 0)
588 vdev
->v_phys_path
= strdup(path
);
590 vdev
->v_phys_path
= NULL
;
591 if (nvlist_find(nvlist
, ZPOOL_CONFIG_DEVID
,
592 DATA_TYPE_STRING
, 0, &path
) == 0)
593 vdev
->v_devid
= strdup(path
);
595 vdev
->v_devid
= NULL
;
597 if (!strcmp(type
, "raidz")) {
598 if (vdev
->v_nparity
== 1)
599 vdev
->v_name
= "raidz1";
600 else if (vdev
->v_nparity
== 2)
601 vdev
->v_name
= "raidz2";
602 else if (vdev
->v_nparity
== 3)
603 vdev
->v_name
= "raidz3";
605 printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
609 vdev
->v_name
= strdup(type
);
616 if (is_new
|| is_newer
) {
618 * This is either new vdev or we've already seen this vdev,
619 * but from an older vdev label, so let's refresh its state
620 * from the newer label.
623 vdev
->v_state
= VDEV_STATE_OFFLINE
;
625 vdev
->v_state
= VDEV_STATE_REMOVED
;
627 vdev
->v_state
= VDEV_STATE_FAULTED
;
628 else if (is_degraded
)
629 vdev
->v_state
= VDEV_STATE_DEGRADED
;
630 else if (isnt_present
)
631 vdev
->v_state
= VDEV_STATE_CANT_OPEN
;
634 rc
= nvlist_find(nvlist
, ZPOOL_CONFIG_CHILDREN
,
635 DATA_TYPE_NVLIST_ARRAY
, &nkids
, &kids
);
637 * Its ok if we don't have any kids.
640 vdev
->v_nchildren
= nkids
;
641 for (i
= 0; i
< nkids
; i
++) {
642 rc
= vdev_init_from_nvlist(kids
, vdev
, &kid
, is_newer
);
646 STAILQ_INSERT_TAIL(&vdev
->v_children
, kid
,
648 kids
= nvlist_next(kids
);
651 vdev
->v_nchildren
= 0;
660 vdev_set_state(vdev_t
*vdev
)
667 * A mirror or raidz is healthy if all its kids are healthy. A
668 * mirror is degraded if any of its kids is healthy; a raidz
669 * is degraded if at most nparity kids are offline.
671 if (STAILQ_FIRST(&vdev
->v_children
)) {
674 STAILQ_FOREACH(kid
, &vdev
->v_children
, v_childlink
) {
675 if (kid
->v_state
== VDEV_STATE_HEALTHY
)
681 vdev
->v_state
= VDEV_STATE_HEALTHY
;
683 if (vdev
->v_read
== vdev_mirror_read
) {
685 vdev
->v_state
= VDEV_STATE_DEGRADED
;
687 vdev
->v_state
= VDEV_STATE_OFFLINE
;
689 } else if (vdev
->v_read
== vdev_raidz_read
) {
690 if (bad_kids
> vdev
->v_nparity
) {
691 vdev
->v_state
= VDEV_STATE_OFFLINE
;
693 vdev
->v_state
= VDEV_STATE_DEGRADED
;
701 spa_find_by_guid(uint64_t guid
)
705 STAILQ_FOREACH(spa
, &zfs_pools
, spa_link
)
706 if (spa
->spa_guid
== guid
)
713 spa_find_by_name(const char *name
)
717 STAILQ_FOREACH(spa
, &zfs_pools
, spa_link
)
718 if (!strcmp(spa
->spa_name
, name
))
725 spa_get_primary(void)
727 return (STAILQ_FIRST(&zfs_pools
));
731 spa_get_primary_vdev(const spa_t
*spa
)
737 spa
= spa_get_primary();
740 vdev
= STAILQ_FIRST(&spa
->spa_vdevs
);
743 for (kid
= STAILQ_FIRST(&vdev
->v_children
); kid
!= NULL
;
744 kid
= STAILQ_FIRST(&vdev
->v_children
))
750 spa_create(uint64_t guid
)
754 spa
= malloc(sizeof(spa_t
));
755 memset(spa
, 0, sizeof(spa_t
));
756 STAILQ_INIT(&spa
->spa_vdevs
);
757 spa
->spa_guid
= guid
;
758 STAILQ_INSERT_TAIL(&zfs_pools
, spa
, spa_link
);
764 state_name(vdev_state_t state
)
766 static const char* names
[] = {
780 pager_printf(const char *fmt
, ...)
786 vsnprintf(line
, sizeof (line
), fmt
, args
);
788 return (pager_output(line
));
791 #define STATUS_FORMAT " %s %s\n"
794 print_state(int indent
, const char *name
, vdev_state_t state
)
800 for (i
= 0; i
< indent
; i
++)
803 return (pager_printf(STATUS_FORMAT
, buf
, state_name(state
)));
807 vdev_status(vdev_t
*vdev
, int indent
)
811 ret
= print_state(indent
, vdev
->v_name
, vdev
->v_state
);
815 STAILQ_FOREACH(kid
, &vdev
->v_children
, v_childlink
) {
816 ret
= vdev_status(kid
, indent
+ 1);
824 spa_status(spa_t
*spa
)
826 static char bootfs
[ZFS_MAXNAMELEN
];
829 int good_kids
, bad_kids
, degraded_kids
, ret
;
832 ret
= pager_printf(" pool: %s\n", spa
->spa_name
);
836 if (zfs_get_root(spa
, &rootid
) == 0 &&
837 zfs_rlookup(spa
, rootid
, bootfs
) == 0) {
838 if (bootfs
[0] == '\0')
839 ret
= pager_printf("bootfs: %s\n", spa
->spa_name
);
841 ret
= pager_printf("bootfs: %s/%s\n", spa
->spa_name
,
846 ret
= pager_printf("config:\n\n");
849 ret
= pager_printf(STATUS_FORMAT
, "NAME", "STATE");
856 STAILQ_FOREACH(vdev
, &spa
->spa_vdevs
, v_childlink
) {
857 if (vdev
->v_state
== VDEV_STATE_HEALTHY
)
859 else if (vdev
->v_state
== VDEV_STATE_DEGRADED
)
865 state
= VDEV_STATE_CLOSED
;
866 if (good_kids
> 0 && (degraded_kids
+ bad_kids
) == 0)
867 state
= VDEV_STATE_HEALTHY
;
868 else if ((good_kids
+ degraded_kids
) > 0)
869 state
= VDEV_STATE_DEGRADED
;
871 ret
= print_state(0, spa
->spa_name
, state
);
874 STAILQ_FOREACH(vdev
, &spa
->spa_vdevs
, v_childlink
) {
875 ret
= vdev_status(vdev
, 1);
886 int first
= 1, ret
= 0;
888 STAILQ_FOREACH(spa
, &zfs_pools
, spa_link
) {
890 ret
= pager_printf("\n");
895 ret
= spa_status(spa
);
903 vdev_probe(vdev_phys_read_t
*phys_read
, void *read_priv
, spa_t
**spap
)
906 vdev_phys_t
*vdev_label
= (vdev_phys_t
*) zap_scratch
;
908 vdev_t
*vdev
, *top_vdev
, *pool_vdev
;
911 const unsigned char *nvlist
;
914 uint64_t pool_txg
, pool_guid
;
916 const char *pool_name
;
917 const unsigned char *vdevs
;
918 const unsigned char *features
;
921 const struct uberblock
*up
;
924 * Load the vdev label and figure out which
925 * uberblock is most current.
927 memset(&vtmp
, 0, sizeof(vtmp
));
928 vtmp
.v_phys_read
= phys_read
;
929 vtmp
.v_read_priv
= read_priv
;
930 off
= offsetof(vdev_label_t
, vl_vdev_phys
);
932 BP_SET_LSIZE(&bp
, sizeof(vdev_phys_t
));
933 BP_SET_PSIZE(&bp
, sizeof(vdev_phys_t
));
934 BP_SET_CHECKSUM(&bp
, ZIO_CHECKSUM_LABEL
);
935 BP_SET_COMPRESS(&bp
, ZIO_COMPRESS_OFF
);
936 DVA_SET_OFFSET(BP_IDENTITY(&bp
), off
);
937 ZIO_SET_CHECKSUM(&bp
.blk_cksum
, off
, 0, 0, 0);
938 if (vdev_read_phys(&vtmp
, &bp
, vdev_label
, off
, 0))
941 if (vdev_label
->vp_nvlist
[0] != NV_ENCODE_XDR
) {
945 nvlist
= (const unsigned char *) vdev_label
->vp_nvlist
+ 4;
947 if (nvlist_find(nvlist
,
948 ZPOOL_CONFIG_VERSION
,
949 DATA_TYPE_UINT64
, 0, &val
)) {
953 if (!SPA_VERSION_IS_SUPPORTED(val
)) {
954 printf("ZFS: unsupported ZFS version %u (should be %u)\n",
955 (unsigned) val
, (unsigned) SPA_VERSION
);
959 /* Check ZFS features for read */
960 if (nvlist_find(nvlist
,
961 ZPOOL_CONFIG_FEATURES_FOR_READ
,
962 DATA_TYPE_NVLIST
, 0, &features
) == 0
963 && nvlist_check_features_for_read(features
) != 0)
966 if (nvlist_find(nvlist
,
967 ZPOOL_CONFIG_POOL_STATE
,
968 DATA_TYPE_UINT64
, 0, &val
)) {
972 if (val
== POOL_STATE_DESTROYED
) {
973 /* We don't boot only from destroyed pools. */
977 if (nvlist_find(nvlist
,
978 ZPOOL_CONFIG_POOL_TXG
,
979 DATA_TYPE_UINT64
, 0, &pool_txg
)
980 || nvlist_find(nvlist
,
981 ZPOOL_CONFIG_POOL_GUID
,
982 DATA_TYPE_UINT64
, 0, &pool_guid
)
983 || nvlist_find(nvlist
,
984 ZPOOL_CONFIG_POOL_NAME
,
985 DATA_TYPE_STRING
, 0, &pool_name
)) {
987 * Cache and spare devices end up here - just ignore
990 /*printf("ZFS: can't find pool details\n");*/
995 (void) nvlist_find(nvlist
, ZPOOL_CONFIG_IS_LOG
, DATA_TYPE_UINT64
, 0,
1001 * Create the pool if this is the first time we've seen it.
1003 spa
= spa_find_by_guid(pool_guid
);
1005 spa
= spa_create(pool_guid
);
1006 spa
->spa_name
= strdup(pool_name
);
1008 if (pool_txg
> spa
->spa_txg
) {
1009 spa
->spa_txg
= pool_txg
;
1015 * Get the vdev tree and create our in-core copy of it.
1016 * If we already have a vdev with this guid, this must
1017 * be some kind of alias (overlapping slices, dangerously dedicated
1020 if (nvlist_find(nvlist
,
1022 DATA_TYPE_UINT64
, 0, &guid
)) {
1025 vdev
= vdev_find(guid
);
1026 if (vdev
&& vdev
->v_phys_read
) /* Has this vdev already been inited? */
1029 if (nvlist_find(nvlist
,
1030 ZPOOL_CONFIG_VDEV_TREE
,
1031 DATA_TYPE_NVLIST
, 0, &vdevs
)) {
1035 rc
= vdev_init_from_nvlist(vdevs
, NULL
, &top_vdev
, is_newer
);
1040 * Add the toplevel vdev to the pool if its not already there.
1042 STAILQ_FOREACH(pool_vdev
, &spa
->spa_vdevs
, v_childlink
)
1043 if (top_vdev
== pool_vdev
)
1045 if (!pool_vdev
&& top_vdev
)
1046 STAILQ_INSERT_TAIL(&spa
->spa_vdevs
, top_vdev
, v_childlink
);
1049 * We should already have created an incomplete vdev for this
1050 * vdev. Find it and initialise it with our read proc.
1052 vdev
= vdev_find(guid
);
1054 vdev
->v_phys_read
= phys_read
;
1055 vdev
->v_read_priv
= read_priv
;
1056 vdev
->v_state
= VDEV_STATE_HEALTHY
;
1058 printf("ZFS: inconsistent nvlist contents\n");
1063 * Re-evaluate top-level vdev state.
1065 vdev_set_state(top_vdev
);
1068 * Ok, we are happy with the pool so far. Lets find
1069 * the best uberblock and then we can actually access
1070 * the contents of the pool.
1072 upbuf
= zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev
));
1073 up
= (const struct uberblock
*)upbuf
;
1075 i
< VDEV_UBERBLOCK_COUNT(vdev
);
1077 off
= VDEV_UBERBLOCK_OFFSET(vdev
, i
);
1079 DVA_SET_OFFSET(&bp
.blk_dva
[0], off
);
1080 BP_SET_LSIZE(&bp
, VDEV_UBERBLOCK_SIZE(vdev
));
1081 BP_SET_PSIZE(&bp
, VDEV_UBERBLOCK_SIZE(vdev
));
1082 BP_SET_CHECKSUM(&bp
, ZIO_CHECKSUM_LABEL
);
1083 BP_SET_COMPRESS(&bp
, ZIO_COMPRESS_OFF
);
1084 ZIO_SET_CHECKSUM(&bp
.blk_cksum
, off
, 0, 0, 0);
1086 if (vdev_read_phys(vdev
, &bp
, upbuf
, off
, 0))
1089 if (up
->ub_magic
!= UBERBLOCK_MAGIC
)
1091 if (up
->ub_txg
< spa
->spa_txg
)
1093 if (up
->ub_txg
> spa
->spa_uberblock
.ub_txg
) {
1094 spa
->spa_uberblock
= *up
;
1095 } else if (up
->ub_txg
== spa
->spa_uberblock
.ub_txg
) {
1096 if (up
->ub_timestamp
> spa
->spa_uberblock
.ub_timestamp
)
1097 spa
->spa_uberblock
= *up
;
1100 zfs_free(upbuf
, VDEV_UBERBLOCK_SIZE(vdev
));
1112 for (v
= 0; v
< 32; v
++)
1119 zio_read_gang(const spa_t
*spa
, const blkptr_t
*bp
, void *buf
)
1122 zio_gbh_phys_t zio_gb
;
1126 /* Artificial BP for gang block header. */
1128 BP_SET_PSIZE(&gbh_bp
, SPA_GANGBLOCKSIZE
);
1129 BP_SET_LSIZE(&gbh_bp
, SPA_GANGBLOCKSIZE
);
1130 BP_SET_CHECKSUM(&gbh_bp
, ZIO_CHECKSUM_GANG_HEADER
);
1131 BP_SET_COMPRESS(&gbh_bp
, ZIO_COMPRESS_OFF
);
1132 for (i
= 0; i
< SPA_DVAS_PER_BP
; i
++)
1133 DVA_SET_GANG(&gbh_bp
.blk_dva
[i
], 0);
1135 /* Read gang header block using the artificial BP. */
1136 if (zio_read(spa
, &gbh_bp
, &zio_gb
))
1140 for (i
= 0; i
< SPA_GBH_NBLKPTRS
; i
++) {
1141 blkptr_t
*gbp
= &zio_gb
.zg_blkptr
[i
];
1143 if (BP_IS_HOLE(gbp
))
1145 if (zio_read(spa
, gbp
, pbuf
))
1147 pbuf
+= BP_GET_PSIZE(gbp
);
1150 if (zio_checksum_verify(bp
, buf
))
1156 zio_read(const spa_t
*spa
, const blkptr_t
*bp
, void *buf
)
1158 int cpfunc
= BP_GET_COMPRESS(bp
);
1159 uint64_t align
, size
;
1164 * Process data embedded in block pointer
1166 if (BP_IS_EMBEDDED(bp
)) {
1167 ASSERT(BPE_GET_ETYPE(bp
) == BP_EMBEDDED_TYPE_DATA
);
1169 size
= BPE_GET_PSIZE(bp
);
1170 ASSERT(size
<= BPE_PAYLOAD_SIZE
);
1172 if (cpfunc
!= ZIO_COMPRESS_OFF
)
1173 pbuf
= zfs_alloc(size
);
1177 decode_embedded_bp_compressed(bp
, pbuf
);
1180 if (cpfunc
!= ZIO_COMPRESS_OFF
) {
1181 error
= zio_decompress_data(cpfunc
, pbuf
,
1182 size
, buf
, BP_GET_LSIZE(bp
));
1183 zfs_free(pbuf
, size
);
1186 printf("ZFS: i/o error - unable to decompress block pointer data, error %d\n",
1193 for (i
= 0; i
< SPA_DVAS_PER_BP
; i
++) {
1194 const dva_t
*dva
= &bp
->blk_dva
[i
];
1199 if (!dva
->dva_word
[0] && !dva
->dva_word
[1])
1202 vdevid
= DVA_GET_VDEV(dva
);
1203 offset
= DVA_GET_OFFSET(dva
);
1204 STAILQ_FOREACH(vdev
, &spa
->spa_vdevs
, v_childlink
) {
1205 if (vdev
->v_id
== vdevid
)
1208 if (!vdev
|| !vdev
->v_read
)
1211 size
= BP_GET_PSIZE(bp
);
1212 if (vdev
->v_read
== vdev_raidz_read
) {
1213 align
= 1ULL << vdev
->v_top
->v_ashift
;
1214 if (P2PHASE(size
, align
) != 0)
1215 size
= P2ROUNDUP(size
, align
);
1217 if (size
!= BP_GET_PSIZE(bp
) || cpfunc
!= ZIO_COMPRESS_OFF
)
1218 pbuf
= zfs_alloc(size
);
1222 if (DVA_GET_GANG(dva
))
1223 error
= zio_read_gang(spa
, bp
, pbuf
);
1225 error
= vdev
->v_read(vdev
, bp
, pbuf
, offset
, size
);
1227 if (cpfunc
!= ZIO_COMPRESS_OFF
)
1228 error
= zio_decompress_data(cpfunc
, pbuf
,
1229 BP_GET_PSIZE(bp
), buf
, BP_GET_LSIZE(bp
));
1230 else if (size
!= BP_GET_PSIZE(bp
))
1231 bcopy(pbuf
, buf
, BP_GET_PSIZE(bp
));
1234 zfs_free(pbuf
, size
);
1239 printf("ZFS: i/o error - all block copies unavailable\n");
1244 dnode_read(const spa_t
*spa
, const dnode_phys_t
*dnode
, off_t offset
, void *buf
, size_t buflen
)
1246 int ibshift
= dnode
->dn_indblkshift
- SPA_BLKPTRSHIFT
;
1247 int bsize
= dnode
->dn_datablkszsec
<< SPA_MINBLOCKSHIFT
;
1248 int nlevels
= dnode
->dn_nlevels
;
1251 if (bsize
> SPA_MAXBLOCKSIZE
) {
1252 printf("ZFS: I/O error - blocks larger than %llu are not "
1253 "supported\n", SPA_MAXBLOCKSIZE
);
1258 * Note: bsize may not be a power of two here so we need to do an
1259 * actual divide rather than a bitshift.
1261 while (buflen
> 0) {
1262 uint64_t bn
= offset
/ bsize
;
1263 int boff
= offset
% bsize
;
1265 const blkptr_t
*indbp
;
1268 if (bn
> dnode
->dn_maxblkid
) {
1269 printf("warning: zfs bug: bn %llx > dn_maxblkid %llx\n",
1270 (unsigned long long)bn
,
1271 (unsigned long long)dnode
->dn_maxblkid
);
1273 * zfs bug, will not return error
1278 if (dnode
== dnode_cache_obj
&& bn
== dnode_cache_bn
)
1281 indbp
= dnode
->dn_blkptr
;
1282 for (i
= 0; i
< nlevels
; i
++) {
1284 * Copy the bp from the indirect array so that
1285 * we can re-use the scratch buffer for multi-level
1288 ibn
= bn
>> ((nlevels
- i
- 1) * ibshift
);
1289 ibn
&= ((1 << ibshift
) - 1);
1291 if (BP_IS_HOLE(&bp
)) {
1292 memset(dnode_cache_buf
, 0, bsize
);
1295 rc
= zio_read(spa
, &bp
, dnode_cache_buf
);
1298 indbp
= (const blkptr_t
*) dnode_cache_buf
;
1300 dnode_cache_obj
= dnode
;
1301 dnode_cache_bn
= bn
;
1305 * The buffer contains our data block. Copy what we
1306 * need from it and loop.
1309 if (i
> buflen
) i
= buflen
;
1310 memcpy(buf
, &dnode_cache_buf
[boff
], i
);
1311 buf
= ((char*) buf
) + i
;
1320 * Lookup a value in a microzap directory. Assumes that the zap
1321 * scratch buffer contains the directory contents.
1324 mzap_lookup(const dnode_phys_t
*dnode
, const char *name
, uint64_t *value
)
1326 const mzap_phys_t
*mz
;
1327 const mzap_ent_phys_t
*mze
;
1332 * Microzap objects use exactly one block. Read the whole
1335 size
= dnode
->dn_datablkszsec
* 512;
1337 mz
= (const mzap_phys_t
*) zap_scratch
;
1338 chunks
= size
/ MZAP_ENT_LEN
- 1;
1340 for (i
= 0; i
< chunks
; i
++) {
1341 mze
= &mz
->mz_chunk
[i
];
1342 if (!strcmp(mze
->mze_name
, name
)) {
1343 *value
= mze
->mze_value
;
1352 * Compare a name with a zap leaf entry. Return non-zero if the name
1356 fzap_name_equal(const zap_leaf_t
*zl
, const zap_leaf_chunk_t
*zc
, const char *name
)
1359 const zap_leaf_chunk_t
*nc
;
1362 namelen
= zc
->l_entry
.le_name_numints
;
1364 nc
= &ZAP_LEAF_CHUNK(zl
, zc
->l_entry
.le_name_chunk
);
1366 while (namelen
> 0) {
1369 if (len
> ZAP_LEAF_ARRAY_BYTES
)
1370 len
= ZAP_LEAF_ARRAY_BYTES
;
1371 if (memcmp(p
, nc
->l_array
.la_array
, len
))
1375 nc
= &ZAP_LEAF_CHUNK(zl
, nc
->l_array
.la_next
);
1382 * Extract a uint64_t value from a zap leaf entry.
1385 fzap_leaf_value(const zap_leaf_t
*zl
, const zap_leaf_chunk_t
*zc
)
1387 const zap_leaf_chunk_t
*vc
;
1392 vc
= &ZAP_LEAF_CHUNK(zl
, zc
->l_entry
.le_value_chunk
);
1393 for (i
= 0, value
= 0, p
= vc
->l_array
.la_array
; i
< 8; i
++) {
1394 value
= (value
<< 8) | p
[i
];
1401 * Lookup a value in a fatzap directory. Assumes that the zap scratch
1402 * buffer contains the directory header.
1405 fzap_lookup(const spa_t
*spa
, const dnode_phys_t
*dnode
, const char *name
, uint64_t *value
)
1407 int bsize
= dnode
->dn_datablkszsec
<< SPA_MINBLOCKSHIFT
;
1408 zap_phys_t zh
= *(zap_phys_t
*) zap_scratch
;
1414 if (zh
.zap_magic
!= ZAP_MAGIC
)
1417 z
.zap_block_shift
= ilog2(bsize
);
1418 z
.zap_phys
= (zap_phys_t
*) zap_scratch
;
1421 * Figure out where the pointer table is and read it in if necessary.
1423 if (zh
.zap_ptrtbl
.zt_blk
) {
1424 rc
= dnode_read(spa
, dnode
, zh
.zap_ptrtbl
.zt_blk
* bsize
,
1425 zap_scratch
, bsize
);
1428 ptrtbl
= (uint64_t *) zap_scratch
;
1430 ptrtbl
= &ZAP_EMBEDDED_PTRTBL_ENT(&z
, 0);
1433 hash
= zap_hash(zh
.zap_salt
, name
);
1436 zl
.l_bs
= z
.zap_block_shift
;
1438 off_t off
= ptrtbl
[hash
>> (64 - zh
.zap_ptrtbl
.zt_shift
)] << zl
.l_bs
;
1439 zap_leaf_chunk_t
*zc
;
1441 rc
= dnode_read(spa
, dnode
, off
, zap_scratch
, bsize
);
1445 zl
.l_phys
= (zap_leaf_phys_t
*) zap_scratch
;
1448 * Make sure this chunk matches our hash.
1450 if (zl
.l_phys
->l_hdr
.lh_prefix_len
> 0
1451 && zl
.l_phys
->l_hdr
.lh_prefix
1452 != hash
>> (64 - zl
.l_phys
->l_hdr
.lh_prefix_len
))
1456 * Hash within the chunk to find our entry.
1458 int shift
= (64 - ZAP_LEAF_HASH_SHIFT(&zl
) - zl
.l_phys
->l_hdr
.lh_prefix_len
);
1459 int h
= (hash
>> shift
) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl
)) - 1);
1460 h
= zl
.l_phys
->l_hash
[h
];
1463 zc
= &ZAP_LEAF_CHUNK(&zl
, h
);
1464 while (zc
->l_entry
.le_hash
!= hash
) {
1465 if (zc
->l_entry
.le_next
== 0xffff) {
1469 zc
= &ZAP_LEAF_CHUNK(&zl
, zc
->l_entry
.le_next
);
1471 if (fzap_name_equal(&zl
, zc
, name
)) {
1472 if (zc
->l_entry
.le_value_intlen
* zc
->l_entry
.le_value_numints
> 8)
1474 *value
= fzap_leaf_value(&zl
, zc
);
1482 * Lookup a name in a zap object and return its value as a uint64_t.
1485 zap_lookup(const spa_t
*spa
, const dnode_phys_t
*dnode
, const char *name
, uint64_t *value
)
1489 size_t size
= dnode
->dn_datablkszsec
<< SPA_MINBLOCKSHIFT
;
1491 rc
= dnode_read(spa
, dnode
, 0, zap_scratch
, size
);
1495 zap_type
= *(uint64_t *) zap_scratch
;
1496 if (zap_type
== ZBT_MICRO
)
1497 return mzap_lookup(dnode
, name
, value
);
1498 else if (zap_type
== ZBT_HEADER
)
1499 return fzap_lookup(spa
, dnode
, name
, value
);
1500 printf("ZFS: invalid zap_type=%d\n", (int)zap_type
);
1505 * List a microzap directory. Assumes that the zap scratch buffer contains
1506 * the directory contents.
1509 mzap_list(const dnode_phys_t
*dnode
, int (*callback
)(const char *, uint64_t))
1511 const mzap_phys_t
*mz
;
1512 const mzap_ent_phys_t
*mze
;
1517 * Microzap objects use exactly one block. Read the whole
1520 size
= dnode
->dn_datablkszsec
* 512;
1521 mz
= (const mzap_phys_t
*) zap_scratch
;
1522 chunks
= size
/ MZAP_ENT_LEN
- 1;
1524 for (i
= 0; i
< chunks
; i
++) {
1525 mze
= &mz
->mz_chunk
[i
];
1526 if (mze
->mze_name
[0]) {
1527 rc
= callback(mze
->mze_name
, mze
->mze_value
);
1537 * List a fatzap directory. Assumes that the zap scratch buffer contains
1538 * the directory header.
1541 fzap_list(const spa_t
*spa
, const dnode_phys_t
*dnode
, int (*callback
)(const char *, uint64_t))
1543 int bsize
= dnode
->dn_datablkszsec
<< SPA_MINBLOCKSHIFT
;
1544 zap_phys_t zh
= *(zap_phys_t
*) zap_scratch
;
1548 if (zh
.zap_magic
!= ZAP_MAGIC
)
1551 z
.zap_block_shift
= ilog2(bsize
);
1552 z
.zap_phys
= (zap_phys_t
*) zap_scratch
;
1555 * This assumes that the leaf blocks start at block 1. The
1556 * documentation isn't exactly clear on this.
1559 zl
.l_bs
= z
.zap_block_shift
;
1560 for (i
= 0; i
< zh
.zap_num_leafs
; i
++) {
1561 off_t off
= (i
+ 1) << zl
.l_bs
;
1565 if (dnode_read(spa
, dnode
, off
, zap_scratch
, bsize
))
1568 zl
.l_phys
= (zap_leaf_phys_t
*) zap_scratch
;
1570 for (j
= 0; j
< ZAP_LEAF_NUMCHUNKS(&zl
); j
++) {
1571 zap_leaf_chunk_t
*zc
, *nc
;
1574 zc
= &ZAP_LEAF_CHUNK(&zl
, j
);
1575 if (zc
->l_entry
.le_type
!= ZAP_CHUNK_ENTRY
)
1577 namelen
= zc
->l_entry
.le_name_numints
;
1578 if (namelen
> sizeof(name
))
1579 namelen
= sizeof(name
);
1582 * Paste the name back together.
1584 nc
= &ZAP_LEAF_CHUNK(&zl
, zc
->l_entry
.le_name_chunk
);
1586 while (namelen
> 0) {
1589 if (len
> ZAP_LEAF_ARRAY_BYTES
)
1590 len
= ZAP_LEAF_ARRAY_BYTES
;
1591 memcpy(p
, nc
->l_array
.la_array
, len
);
1594 nc
= &ZAP_LEAF_CHUNK(&zl
, nc
->l_array
.la_next
);
1598 * Assume the first eight bytes of the value are
1601 value
= fzap_leaf_value(&zl
, zc
);
1603 //printf("%s 0x%jx\n", name, (uintmax_t)value);
1604 rc
= callback((const char *)name
, value
);
1613 static int zfs_printf(const char *name
, uint64_t value __unused
)
1616 printf("%s\n", name
);
1622 * List a zap directory.
1625 zap_list(const spa_t
*spa
, const dnode_phys_t
*dnode
)
1628 size_t size
= dnode
->dn_datablkszsec
* 512;
1630 if (dnode_read(spa
, dnode
, 0, zap_scratch
, size
))
1633 zap_type
= *(uint64_t *) zap_scratch
;
1634 if (zap_type
== ZBT_MICRO
)
1635 return mzap_list(dnode
, zfs_printf
);
1637 return fzap_list(spa
, dnode
, zfs_printf
);
1641 objset_get_dnode(const spa_t
*spa
, const objset_phys_t
*os
, uint64_t objnum
, dnode_phys_t
*dnode
)
1645 offset
= objnum
* sizeof(dnode_phys_t
);
1646 return dnode_read(spa
, &os
->os_meta_dnode
, offset
,
1647 dnode
, sizeof(dnode_phys_t
));
1651 mzap_rlookup(const spa_t
*spa
, const dnode_phys_t
*dnode
, char *name
, uint64_t value
)
1653 const mzap_phys_t
*mz
;
1654 const mzap_ent_phys_t
*mze
;
1659 * Microzap objects use exactly one block. Read the whole
1662 size
= dnode
->dn_datablkszsec
* 512;
1664 mz
= (const mzap_phys_t
*) zap_scratch
;
1665 chunks
= size
/ MZAP_ENT_LEN
- 1;
1667 for (i
= 0; i
< chunks
; i
++) {
1668 mze
= &mz
->mz_chunk
[i
];
1669 if (value
== mze
->mze_value
) {
1670 strcpy(name
, mze
->mze_name
);
1679 fzap_name_copy(const zap_leaf_t
*zl
, const zap_leaf_chunk_t
*zc
, char *name
)
1682 const zap_leaf_chunk_t
*nc
;
1685 namelen
= zc
->l_entry
.le_name_numints
;
1687 nc
= &ZAP_LEAF_CHUNK(zl
, zc
->l_entry
.le_name_chunk
);
1689 while (namelen
> 0) {
1692 if (len
> ZAP_LEAF_ARRAY_BYTES
)
1693 len
= ZAP_LEAF_ARRAY_BYTES
;
1694 memcpy(p
, nc
->l_array
.la_array
, len
);
1697 nc
= &ZAP_LEAF_CHUNK(zl
, nc
->l_array
.la_next
);
1704 fzap_rlookup(const spa_t
*spa
, const dnode_phys_t
*dnode
, char *name
, uint64_t value
)
1706 int bsize
= dnode
->dn_datablkszsec
<< SPA_MINBLOCKSHIFT
;
1707 zap_phys_t zh
= *(zap_phys_t
*) zap_scratch
;
1711 if (zh
.zap_magic
!= ZAP_MAGIC
)
1714 z
.zap_block_shift
= ilog2(bsize
);
1715 z
.zap_phys
= (zap_phys_t
*) zap_scratch
;
1718 * This assumes that the leaf blocks start at block 1. The
1719 * documentation isn't exactly clear on this.
1722 zl
.l_bs
= z
.zap_block_shift
;
1723 for (i
= 0; i
< zh
.zap_num_leafs
; i
++) {
1724 off_t off
= (i
+ 1) << zl
.l_bs
;
1726 if (dnode_read(spa
, dnode
, off
, zap_scratch
, bsize
))
1729 zl
.l_phys
= (zap_leaf_phys_t
*) zap_scratch
;
1731 for (j
= 0; j
< ZAP_LEAF_NUMCHUNKS(&zl
); j
++) {
1732 zap_leaf_chunk_t
*zc
;
1734 zc
= &ZAP_LEAF_CHUNK(&zl
, j
);
1735 if (zc
->l_entry
.le_type
!= ZAP_CHUNK_ENTRY
)
1737 if (zc
->l_entry
.le_value_intlen
!= 8 ||
1738 zc
->l_entry
.le_value_numints
!= 1)
1741 if (fzap_leaf_value(&zl
, zc
) == value
) {
1742 fzap_name_copy(&zl
, zc
, name
);
1752 zap_rlookup(const spa_t
*spa
, const dnode_phys_t
*dnode
, char *name
, uint64_t value
)
1756 size_t size
= dnode
->dn_datablkszsec
* 512;
1758 rc
= dnode_read(spa
, dnode
, 0, zap_scratch
, size
);
1762 zap_type
= *(uint64_t *) zap_scratch
;
1763 if (zap_type
== ZBT_MICRO
)
1764 return mzap_rlookup(spa
, dnode
, name
, value
);
1766 return fzap_rlookup(spa
, dnode
, name
, value
);
1770 zfs_rlookup(const spa_t
*spa
, uint64_t objnum
, char *result
)
1773 char component
[256];
1774 uint64_t dir_obj
, parent_obj
, child_dir_zapobj
;
1775 dnode_phys_t child_dir_zap
, dataset
, dir
, parent
;
1777 dsl_dataset_phys_t
*ds
;
1781 p
= &name
[sizeof(name
) - 1];
1784 if (objset_get_dnode(spa
, &spa
->spa_mos
, objnum
, &dataset
)) {
1785 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum
);
1788 ds
= (dsl_dataset_phys_t
*)&dataset
.dn_bonus
;
1789 dir_obj
= ds
->ds_dir_obj
;
1792 if (objset_get_dnode(spa
, &spa
->spa_mos
, dir_obj
, &dir
) != 0)
1794 dd
= (dsl_dir_phys_t
*)&dir
.dn_bonus
;
1796 /* Actual loop condition. */
1797 parent_obj
= dd
->dd_parent_obj
;
1798 if (parent_obj
== 0)
1801 if (objset_get_dnode(spa
, &spa
->spa_mos
, parent_obj
, &parent
) != 0)
1803 dd
= (dsl_dir_phys_t
*)&parent
.dn_bonus
;
1804 child_dir_zapobj
= dd
->dd_child_dir_zapobj
;
1805 if (objset_get_dnode(spa
, &spa
->spa_mos
, child_dir_zapobj
, &child_dir_zap
) != 0)
1807 if (zap_rlookup(spa
, &child_dir_zap
, component
, dir_obj
) != 0)
1810 len
= strlen(component
);
1812 memcpy(p
, component
, len
);
1816 /* Actual loop iteration. */
1817 dir_obj
= parent_obj
;
1828 zfs_lookup_dataset(const spa_t
*spa
, const char *name
, uint64_t *objnum
)
1831 uint64_t dir_obj
, child_dir_zapobj
;
1832 dnode_phys_t child_dir_zap
, dir
;
1836 if (objset_get_dnode(spa
, &spa
->spa_mos
, DMU_POOL_DIRECTORY_OBJECT
, &dir
))
1838 if (zap_lookup(spa
, &dir
, DMU_POOL_ROOT_DATASET
, &dir_obj
))
1843 if (objset_get_dnode(spa
, &spa
->spa_mos
, dir_obj
, &dir
))
1845 dd
= (dsl_dir_phys_t
*)&dir
.dn_bonus
;
1849 /* Actual loop condition #1. */
1855 memcpy(element
, p
, q
- p
);
1856 element
[q
- p
] = '\0';
1863 child_dir_zapobj
= dd
->dd_child_dir_zapobj
;
1864 if (objset_get_dnode(spa
, &spa
->spa_mos
, child_dir_zapobj
, &child_dir_zap
) != 0)
1867 /* Actual loop condition #2. */
1868 if (zap_lookup(spa
, &child_dir_zap
, element
, &dir_obj
) != 0)
1872 *objnum
= dd
->dd_head_dataset_obj
;
1876 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
1878 zfs_list_dataset(const spa_t
*spa
, uint64_t objnum
/*, int pos, char *entry*/)
1880 uint64_t dir_obj
, child_dir_zapobj
;
1881 dnode_phys_t child_dir_zap
, dir
, dataset
;
1882 dsl_dataset_phys_t
*ds
;
1885 if (objset_get_dnode(spa
, &spa
->spa_mos
, objnum
, &dataset
)) {
1886 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum
);
1889 ds
= (dsl_dataset_phys_t
*) &dataset
.dn_bonus
;
1890 dir_obj
= ds
->ds_dir_obj
;
1892 if (objset_get_dnode(spa
, &spa
->spa_mos
, dir_obj
, &dir
)) {
1893 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj
);
1896 dd
= (dsl_dir_phys_t
*)&dir
.dn_bonus
;
1898 child_dir_zapobj
= dd
->dd_child_dir_zapobj
;
1899 if (objset_get_dnode(spa
, &spa
->spa_mos
, child_dir_zapobj
, &child_dir_zap
) != 0) {
1900 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj
);
1904 return (zap_list(spa
, &child_dir_zap
) != 0);
1908 zfs_callback_dataset(const spa_t
*spa
, uint64_t objnum
, int (*callback
)(const char *, uint64_t))
1910 uint64_t dir_obj
, child_dir_zapobj
, zap_type
;
1911 dnode_phys_t child_dir_zap
, dir
, dataset
;
1912 dsl_dataset_phys_t
*ds
;
1916 err
= objset_get_dnode(spa
, &spa
->spa_mos
, objnum
, &dataset
);
1918 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum
);
1921 ds
= (dsl_dataset_phys_t
*) &dataset
.dn_bonus
;
1922 dir_obj
= ds
->ds_dir_obj
;
1924 err
= objset_get_dnode(spa
, &spa
->spa_mos
, dir_obj
, &dir
);
1926 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj
);
1929 dd
= (dsl_dir_phys_t
*)&dir
.dn_bonus
;
1931 child_dir_zapobj
= dd
->dd_child_dir_zapobj
;
1932 err
= objset_get_dnode(spa
, &spa
->spa_mos
, child_dir_zapobj
, &child_dir_zap
);
1934 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj
);
1938 err
= dnode_read(spa
, &child_dir_zap
, 0, zap_scratch
, child_dir_zap
.dn_datablkszsec
* 512);
1942 zap_type
= *(uint64_t *) zap_scratch
;
1943 if (zap_type
== ZBT_MICRO
)
1944 return mzap_list(&child_dir_zap
, callback
);
1946 return fzap_list(spa
, &child_dir_zap
, callback
);
1950 * Find the object set given the object number of its dataset object
1951 * and return its details in *objset
1954 zfs_mount_dataset(const spa_t
*spa
, uint64_t objnum
, objset_phys_t
*objset
)
1956 dnode_phys_t dataset
;
1957 dsl_dataset_phys_t
*ds
;
1959 if (objset_get_dnode(spa
, &spa
->spa_mos
, objnum
, &dataset
)) {
1960 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum
);
1964 ds
= (dsl_dataset_phys_t
*) &dataset
.dn_bonus
;
1965 if (zio_read(spa
, &ds
->ds_bp
, objset
)) {
1966 printf("ZFS: can't read object set for dataset %ju\n",
1975 * Find the object set pointed to by the BOOTFS property or the root
1976 * dataset if there is none and return its details in *objset
1979 zfs_get_root(const spa_t
*spa
, uint64_t *objid
)
1981 dnode_phys_t dir
, propdir
;
1982 uint64_t props
, bootfs
, root
;
1987 * Start with the MOS directory object.
1989 if (objset_get_dnode(spa
, &spa
->spa_mos
, DMU_POOL_DIRECTORY_OBJECT
, &dir
)) {
1990 printf("ZFS: can't read MOS object directory\n");
1995 * Lookup the pool_props and see if we can find a bootfs.
1997 if (zap_lookup(spa
, &dir
, DMU_POOL_PROPS
, &props
) == 0
1998 && objset_get_dnode(spa
, &spa
->spa_mos
, props
, &propdir
) == 0
1999 && zap_lookup(spa
, &propdir
, "bootfs", &bootfs
) == 0
2006 * Lookup the root dataset directory
2008 if (zap_lookup(spa
, &dir
, DMU_POOL_ROOT_DATASET
, &root
)
2009 || objset_get_dnode(spa
, &spa
->spa_mos
, root
, &dir
)) {
2010 printf("ZFS: can't find root dsl_dir\n");
2015 * Use the information from the dataset directory's bonus buffer
2016 * to find the dataset object and from that the object set itself.
2018 dsl_dir_phys_t
*dd
= (dsl_dir_phys_t
*) &dir
.dn_bonus
;
2019 *objid
= dd
->dd_head_dataset_obj
;
2024 zfs_mount(const spa_t
*spa
, uint64_t rootobj
, struct zfsmount
*mnt
)
2030 * Find the root object set if not explicitly provided
2032 if (rootobj
== 0 && zfs_get_root(spa
, &rootobj
)) {
2033 printf("ZFS: can't find root filesystem\n");
2037 if (zfs_mount_dataset(spa
, rootobj
, &mnt
->objset
)) {
2038 printf("ZFS: can't open root filesystem\n");
2042 mnt
->rootobj
= rootobj
;
2048 * callback function for feature name checks.
2051 check_feature(const char *name
, uint64_t value
)
2057 if (name
[0] == '\0')
2060 for (i
= 0; features_for_read
[i
] != NULL
; i
++) {
2061 if (strcmp(name
, features_for_read
[i
]) == 0)
2064 printf("ZFS: unsupported feature: %s\n", name
);
2069 * Checks whether the MOS features that are active are supported.
2072 check_mos_features(const spa_t
*spa
)
2075 uint64_t objnum
, zap_type
;
2079 if ((rc
= objset_get_dnode(spa
, &spa
->spa_mos
, DMU_OT_OBJECT_DIRECTORY
,
2082 if ((rc
= zap_lookup(spa
, &dir
, DMU_POOL_FEATURES_FOR_READ
, &objnum
)) != 0)
2085 if ((rc
= objset_get_dnode(spa
, &spa
->spa_mos
, objnum
, &dir
)) != 0)
2088 if (dir
.dn_type
!= DMU_OTN_ZAP_METADATA
)
2091 size
= dir
.dn_datablkszsec
* 512;
2092 if (dnode_read(spa
, &dir
, 0, zap_scratch
, size
))
2095 zap_type
= *(uint64_t *) zap_scratch
;
2096 if (zap_type
== ZBT_MICRO
)
2097 rc
= mzap_list(&dir
, check_feature
);
2099 rc
= fzap_list(spa
, &dir
, check_feature
);
2105 zfs_spa_init(spa_t
*spa
)
2109 if (zio_read(spa
, &spa
->spa_uberblock
.ub_rootbp
, &spa
->spa_mos
)) {
2110 printf("ZFS: can't read MOS of pool %s\n", spa
->spa_name
);
2113 if (spa
->spa_mos
.os_type
!= DMU_OST_META
) {
2114 printf("ZFS: corrupted MOS of pool %s\n", spa
->spa_name
);
2118 rc
= check_mos_features(spa
);
2120 printf("ZFS: pool %s is not supported\n", spa
->spa_name
);
2127 zfs_dnode_stat(const spa_t
*spa
, dnode_phys_t
*dn
, struct stat
*sb
)
2130 if (dn
->dn_bonustype
!= DMU_OT_SA
) {
2131 znode_phys_t
*zp
= (znode_phys_t
*)dn
->dn_bonus
;
2133 sb
->st_mode
= zp
->zp_mode
;
2134 sb
->st_uid
= zp
->zp_uid
;
2135 sb
->st_gid
= zp
->zp_gid
;
2136 sb
->st_size
= zp
->zp_size
;
2138 sa_hdr_phys_t
*sahdrp
;
2143 if (dn
->dn_bonuslen
!= 0)
2144 sahdrp
= (sa_hdr_phys_t
*)DN_BONUS(dn
);
2146 if ((dn
->dn_flags
& DNODE_FLAG_SPILL_BLKPTR
) != 0) {
2147 blkptr_t
*bp
= &dn
->dn_spill
;
2150 size
= BP_GET_LSIZE(bp
);
2151 buf
= zfs_alloc(size
);
2152 error
= zio_read(spa
, bp
, buf
);
2154 zfs_free(buf
, size
);
2162 hdrsize
= SA_HDR_SIZE(sahdrp
);
2163 sb
->st_mode
= *(uint64_t *)((char *)sahdrp
+ hdrsize
+
2165 sb
->st_uid
= *(uint64_t *)((char *)sahdrp
+ hdrsize
+
2167 sb
->st_gid
= *(uint64_t *)((char *)sahdrp
+ hdrsize
+
2169 sb
->st_size
= *(uint64_t *)((char *)sahdrp
+ hdrsize
+
2172 zfs_free(buf
, size
);
2179 * Lookup a file and return its dnode.
2182 zfs_lookup(const struct zfsmount
*mnt
, const char *upath
, dnode_phys_t
*dnode
)
2185 uint64_t objnum
, rootnum
, parentnum
;
2191 int symlinks_followed
= 0;
2195 if (mnt
->objset
.os_type
!= DMU_OST_ZFS
) {
2196 printf("ZFS: unexpected object set type %ju\n",
2197 (uintmax_t)mnt
->objset
.os_type
);
2202 * Get the root directory dnode.
2204 rc
= objset_get_dnode(spa
, &mnt
->objset
, MASTER_NODE_OBJ
, &dn
);
2208 rc
= zap_lookup(spa
, &dn
, ZFS_ROOT_OBJ
, &rootnum
);
2212 rc
= objset_get_dnode(spa
, &mnt
->objset
, rootnum
, &dn
);
2225 memcpy(element
, p
, q
- p
);
2233 rc
= zfs_dnode_stat(spa
, &dn
, &sb
);
2236 if (!S_ISDIR(sb
.st_mode
))
2240 rc
= zap_lookup(spa
, &dn
, element
, &objnum
);
2243 objnum
= ZFS_DIRENT_OBJ(objnum
);
2245 rc
= objset_get_dnode(spa
, &mnt
->objset
, objnum
, &dn
);
2250 * Check for symlink.
2252 rc
= zfs_dnode_stat(spa
, &dn
, &sb
);
2255 if (S_ISLNK(sb
.st_mode
)) {
2256 if (symlinks_followed
> 10)
2258 symlinks_followed
++;
2261 * Read the link value and copy the tail of our
2262 * current path onto the end.
2265 strcpy(&path
[sb
.st_size
], p
);
2267 path
[sb
.st_size
] = 0;
2269 * Second test is purely to silence bogus compiler
2270 * warning about accessing past the end of dn_bonus.
2272 if (sb
.st_size
+ sizeof(znode_phys_t
) <=
2273 dn
.dn_bonuslen
&& sizeof(znode_phys_t
) <=
2274 sizeof(dn
.dn_bonus
)) {
2275 memcpy(path
, &dn
.dn_bonus
[sizeof(znode_phys_t
)],
2278 rc
= dnode_read(spa
, &dn
, 0, path
, sb
.st_size
);
2284 * Restart with the new path, starting either at
2285 * the root or at the parent depending whether or
2286 * not the link is relative.
2293 objset_get_dnode(spa
, &mnt
->objset
, objnum
, &dn
);