2 * Copyright (c) 2007 Doug Rabson
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 #include <sys/cdefs.h>
30 * Stand-alone ZFS file reader.
34 #include <sys/stdint.h>
47 * List of all vdevs, chained through v_alllink.
49 static vdev_list_t zfs_vdevs
;
52 * List of ZFS features supported for read
54 static const char *features_for_read
[] = {
55 "org.illumos:lz4_compress",
56 "com.delphix:hole_birth",
57 "com.delphix:extensible_dataset",
58 "com.delphix:embedded_data",
59 "org.open-zfs:large_blocks",
61 "org.zfsonlinux:large_dnode",
62 "com.joyent:multi_vdev_crash_dump",
67 * List of all pools, chained through spa_link.
69 static spa_list_t zfs_pools
;
71 static const dnode_phys_t
*dnode_cache_obj
;
72 static uint64_t dnode_cache_bn
;
73 static char *dnode_cache_buf
;
74 static char *zap_scratch
;
75 static char *zfs_temp_buf
, *zfs_temp_end
, *zfs_temp_ptr
;
77 #define TEMP_SIZE (1024 * 1024)
79 static int zio_read(const spa_t
*spa
, const blkptr_t
*bp
, void *buf
);
80 static int zfs_get_root(const spa_t
*spa
, uint64_t *objid
);
81 static int zfs_rlookup(const spa_t
*spa
, uint64_t objnum
, char *result
);
86 STAILQ_INIT(&zfs_vdevs
);
87 STAILQ_INIT(&zfs_pools
);
89 zfs_temp_buf
= malloc(TEMP_SIZE
);
90 zfs_temp_end
= zfs_temp_buf
+ TEMP_SIZE
;
91 zfs_temp_ptr
= zfs_temp_buf
;
92 dnode_cache_buf
= malloc(SPA_MAXBLOCKSIZE
);
93 zap_scratch
= malloc(SPA_MAXBLOCKSIZE
);
99 zfs_alloc(size_t size
)
103 if (zfs_temp_ptr
+ size
> zfs_temp_end
) {
104 printf("ZFS: out of temporary buffer space\n");
108 zfs_temp_ptr
+= size
;
114 zfs_free(void *ptr
, size_t size
)
117 zfs_temp_ptr
-= size
;
118 if (zfs_temp_ptr
!= ptr
) {
119 printf("ZFS: zfs_alloc()/zfs_free() mismatch\n");
125 xdr_int(const unsigned char **xdr
, int *ip
)
127 *ip
= ((*xdr
)[0] << 24)
136 xdr_u_int(const unsigned char **xdr
, u_int
*ip
)
138 *ip
= ((*xdr
)[0] << 24)
147 xdr_uint64_t(const unsigned char **xdr
, uint64_t *lp
)
153 *lp
= (((uint64_t) hi
) << 32) | lo
;
158 nvlist_find(const unsigned char *nvlist
, const char *name
, int type
,
159 int* elementsp
, void *valuep
)
161 const unsigned char *p
, *pair
;
163 int encoded_size
, decoded_size
;
170 xdr_int(&p
, &encoded_size
);
171 xdr_int(&p
, &decoded_size
);
172 while (encoded_size
&& decoded_size
) {
173 int namelen
, pairtype
, elements
;
174 const char *pairname
;
176 xdr_int(&p
, &namelen
);
177 pairname
= (const char*) p
;
178 p
+= roundup(namelen
, 4);
179 xdr_int(&p
, &pairtype
);
181 if (!memcmp(name
, pairname
, namelen
) && type
== pairtype
) {
182 xdr_int(&p
, &elements
);
184 *elementsp
= elements
;
185 if (type
== DATA_TYPE_UINT64
) {
186 xdr_uint64_t(&p
, (uint64_t *) valuep
);
188 } else if (type
== DATA_TYPE_STRING
) {
191 (*(const char**) valuep
) = (const char*) p
;
193 } else if (type
== DATA_TYPE_NVLIST
194 || type
== DATA_TYPE_NVLIST_ARRAY
) {
195 (*(const unsigned char**) valuep
) =
196 (const unsigned char*) p
;
203 * Not the pair we are looking for, skip to the next one.
205 p
= pair
+ encoded_size
;
209 xdr_int(&p
, &encoded_size
);
210 xdr_int(&p
, &decoded_size
);
217 nvlist_check_features_for_read(const unsigned char *nvlist
)
219 const unsigned char *p
, *pair
;
221 int encoded_size
, decoded_size
;
231 xdr_int(&p
, &encoded_size
);
232 xdr_int(&p
, &decoded_size
);
233 while (encoded_size
&& decoded_size
) {
234 int namelen
, pairtype
;
235 const char *pairname
;
240 xdr_int(&p
, &namelen
);
241 pairname
= (const char*) p
;
242 p
+= roundup(namelen
, 4);
243 xdr_int(&p
, &pairtype
);
245 for (i
= 0; features_for_read
[i
] != NULL
; i
++) {
246 if (!memcmp(pairname
, features_for_read
[i
], namelen
)) {
253 printf("ZFS: unsupported feature: %s\n", pairname
);
257 p
= pair
+ encoded_size
;
260 xdr_int(&p
, &encoded_size
);
261 xdr_int(&p
, &decoded_size
);
268 * Return the next nvlist in an nvlist array.
270 static const unsigned char *
271 nvlist_next(const unsigned char *nvlist
)
273 const unsigned char *p
, *pair
;
275 int encoded_size
, decoded_size
;
282 xdr_int(&p
, &encoded_size
);
283 xdr_int(&p
, &decoded_size
);
284 while (encoded_size
&& decoded_size
) {
285 p
= pair
+ encoded_size
;
288 xdr_int(&p
, &encoded_size
);
289 xdr_int(&p
, &decoded_size
);
297 static const unsigned char *
298 nvlist_print(const unsigned char *nvlist
, unsigned int indent
)
300 static const char* typenames
[] = {
311 "DATA_TYPE_BYTE_ARRAY",
312 "DATA_TYPE_INT16_ARRAY",
313 "DATA_TYPE_UINT16_ARRAY",
314 "DATA_TYPE_INT32_ARRAY",
315 "DATA_TYPE_UINT32_ARRAY",
316 "DATA_TYPE_INT64_ARRAY",
317 "DATA_TYPE_UINT64_ARRAY",
318 "DATA_TYPE_STRING_ARRAY",
321 "DATA_TYPE_NVLIST_ARRAY",
322 "DATA_TYPE_BOOLEAN_VALUE",
325 "DATA_TYPE_BOOLEAN_ARRAY",
326 "DATA_TYPE_INT8_ARRAY",
327 "DATA_TYPE_UINT8_ARRAY"
331 const unsigned char *p
, *pair
;
333 int encoded_size
, decoded_size
;
340 xdr_int(&p
, &encoded_size
);
341 xdr_int(&p
, &decoded_size
);
342 while (encoded_size
&& decoded_size
) {
343 int namelen
, pairtype
, elements
;
344 const char *pairname
;
346 xdr_int(&p
, &namelen
);
347 pairname
= (const char*) p
;
348 p
+= roundup(namelen
, 4);
349 xdr_int(&p
, &pairtype
);
351 for (i
= 0; i
< indent
; i
++)
353 printf("%s %s", typenames
[pairtype
], pairname
);
355 xdr_int(&p
, &elements
);
357 case DATA_TYPE_UINT64
: {
359 xdr_uint64_t(&p
, &val
);
360 printf(" = 0x%jx\n", (uintmax_t)val
);
364 case DATA_TYPE_STRING
: {
367 printf(" = \"%s\"\n", p
);
371 case DATA_TYPE_NVLIST
:
373 nvlist_print(p
, indent
+ 1);
376 case DATA_TYPE_NVLIST_ARRAY
:
377 for (j
= 0; j
< elements
; j
++) {
379 p
= nvlist_print(p
, indent
+ 1);
380 if (j
!= elements
- 1) {
381 for (i
= 0; i
< indent
; i
++)
383 printf("%s %s", typenames
[pairtype
], pairname
);
392 p
= pair
+ encoded_size
;
395 xdr_int(&p
, &encoded_size
);
396 xdr_int(&p
, &decoded_size
);
405 vdev_read_phys(vdev_t
*vdev
, const blkptr_t
*bp
, void *buf
,
406 off_t offset
, size_t size
)
411 if (!vdev
->v_phys_read
)
415 psize
= BP_GET_PSIZE(bp
);
420 /*printf("ZFS: reading %zu bytes at 0x%jx to %p\n", psize, (uintmax_t)offset, buf);*/
421 rc
= vdev
->v_phys_read(vdev
, vdev
->v_read_priv
, offset
, buf
, psize
);
424 if (bp
&& zio_checksum_verify(bp
, buf
))
431 vdev_disk_read(vdev_t
*vdev
, const blkptr_t
*bp
, void *buf
,
432 off_t offset
, size_t bytes
)
435 return (vdev_read_phys(vdev
, bp
, buf
,
436 offset
+ VDEV_LABEL_START_SIZE
, bytes
));
441 vdev_mirror_read(vdev_t
*vdev
, const blkptr_t
*bp
, void *buf
,
442 off_t offset
, size_t bytes
)
448 STAILQ_FOREACH(kid
, &vdev
->v_children
, v_childlink
) {
449 if (kid
->v_state
!= VDEV_STATE_HEALTHY
)
451 rc
= kid
->v_read(kid
, bp
, buf
, offset
, bytes
);
460 vdev_replacing_read(vdev_t
*vdev
, const blkptr_t
*bp
, void *buf
,
461 off_t offset
, size_t bytes
)
466 * Here we should have two kids:
467 * First one which is the one we are replacing and we can trust
468 * only this one to have valid data, but it might not be present.
469 * The second one is the one we are replacing it with. It is most likely
470 * healthy, but we can't trust it has needed data, so we won't use it.
472 kid
= STAILQ_FIRST(&vdev
->v_children
);
475 if (kid
->v_state
!= VDEV_STATE_HEALTHY
)
477 return (kid
->v_read(kid
, bp
, buf
, offset
, bytes
));
481 vdev_find(uint64_t guid
)
485 STAILQ_FOREACH(vdev
, &zfs_vdevs
, v_alllink
)
486 if (vdev
->v_guid
== guid
)
493 vdev_create(uint64_t guid
, vdev_read_t
*vdev_read
)
497 vdev
= malloc(sizeof(vdev_t
));
498 memset(vdev
, 0, sizeof(vdev_t
));
499 STAILQ_INIT(&vdev
->v_children
);
501 vdev
->v_state
= VDEV_STATE_OFFLINE
;
502 vdev
->v_read
= vdev_read
;
503 vdev
->v_phys_read
= 0;
504 vdev
->v_read_priv
= 0;
505 STAILQ_INSERT_TAIL(&zfs_vdevs
, vdev
, v_alllink
);
511 vdev_init_from_nvlist(const unsigned char *nvlist
, vdev_t
*pvdev
,
512 vdev_t
**vdevp
, int is_newer
)
515 uint64_t guid
, id
, ashift
, nparity
;
519 const unsigned char *kids
;
520 int nkids
, i
, is_new
;
521 uint64_t is_offline
, is_faulted
, is_degraded
, is_removed
, isnt_present
;
523 if (nvlist_find(nvlist
, ZPOOL_CONFIG_GUID
, DATA_TYPE_UINT64
,
525 nvlist_find(nvlist
, ZPOOL_CONFIG_ID
, DATA_TYPE_UINT64
, NULL
, &id
) ||
526 nvlist_find(nvlist
, ZPOOL_CONFIG_TYPE
, DATA_TYPE_STRING
,
528 printf("ZFS: can't find vdev details\n");
532 if (strcmp(type
, VDEV_TYPE_MIRROR
)
533 && strcmp(type
, VDEV_TYPE_DISK
)
535 && strcmp(type
, VDEV_TYPE_FILE
)
537 && strcmp(type
, VDEV_TYPE_RAIDZ
)
538 && strcmp(type
, VDEV_TYPE_REPLACING
)) {
539 printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
543 is_offline
= is_removed
= is_faulted
= is_degraded
= isnt_present
= 0;
545 nvlist_find(nvlist
, ZPOOL_CONFIG_OFFLINE
, DATA_TYPE_UINT64
, NULL
,
547 nvlist_find(nvlist
, ZPOOL_CONFIG_REMOVED
, DATA_TYPE_UINT64
, NULL
,
549 nvlist_find(nvlist
, ZPOOL_CONFIG_FAULTED
, DATA_TYPE_UINT64
, NULL
,
551 nvlist_find(nvlist
, ZPOOL_CONFIG_DEGRADED
, DATA_TYPE_UINT64
, NULL
,
553 nvlist_find(nvlist
, ZPOOL_CONFIG_NOT_PRESENT
, DATA_TYPE_UINT64
, NULL
,
556 vdev
= vdev_find(guid
);
560 if (!strcmp(type
, VDEV_TYPE_MIRROR
))
561 vdev
= vdev_create(guid
, vdev_mirror_read
);
562 else if (!strcmp(type
, VDEV_TYPE_RAIDZ
))
563 vdev
= vdev_create(guid
, vdev_raidz_read
);
564 else if (!strcmp(type
, VDEV_TYPE_REPLACING
))
565 vdev
= vdev_create(guid
, vdev_replacing_read
);
567 vdev
= vdev_create(guid
, vdev_disk_read
);
570 vdev
->v_top
= pvdev
!= NULL
? pvdev
: vdev
;
571 if (nvlist_find(nvlist
, ZPOOL_CONFIG_ASHIFT
,
572 DATA_TYPE_UINT64
, NULL
, &ashift
) == 0) {
573 vdev
->v_ashift
= ashift
;
577 if (nvlist_find(nvlist
, ZPOOL_CONFIG_NPARITY
,
578 DATA_TYPE_UINT64
, NULL
, &nparity
) == 0) {
579 vdev
->v_nparity
= nparity
;
583 if (nvlist_find(nvlist
, ZPOOL_CONFIG_PATH
,
584 DATA_TYPE_STRING
, NULL
, &path
) == 0) {
585 if (strncmp(path
, "/dev/dsk/", 9) == 0)
587 vdev
->v_name
= strdup(path
);
588 if (nvlist_find(nvlist
, ZPOOL_CONFIG_PHYS_PATH
,
589 DATA_TYPE_STRING
, NULL
, &path
) == 0) {
590 vdev
->v_phys_path
= strdup(path
);
592 vdev
->v_phys_path
= NULL
;
594 if (nvlist_find(nvlist
, ZPOOL_CONFIG_DEVID
,
595 DATA_TYPE_STRING
, NULL
, &path
) == 0) {
596 vdev
->v_devid
= strdup(path
);
598 vdev
->v_devid
= NULL
;
601 if (!strcmp(type
, "raidz")) {
602 if (vdev
->v_nparity
== 1)
603 vdev
->v_name
= "raidz1";
604 else if (vdev
->v_nparity
== 2)
605 vdev
->v_name
= "raidz2";
606 else if (vdev
->v_nparity
== 3)
607 vdev
->v_name
= "raidz3";
609 printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
613 vdev
->v_name
= strdup(type
);
620 if (is_new
|| is_newer
) {
622 * This is either new vdev or we've already seen this vdev,
623 * but from an older vdev label, so let's refresh its state
624 * from the newer label.
627 vdev
->v_state
= VDEV_STATE_OFFLINE
;
629 vdev
->v_state
= VDEV_STATE_REMOVED
;
631 vdev
->v_state
= VDEV_STATE_FAULTED
;
632 else if (is_degraded
)
633 vdev
->v_state
= VDEV_STATE_DEGRADED
;
634 else if (isnt_present
)
635 vdev
->v_state
= VDEV_STATE_CANT_OPEN
;
638 rc
= nvlist_find(nvlist
, ZPOOL_CONFIG_CHILDREN
, DATA_TYPE_NVLIST_ARRAY
,
641 * It's OK if we don't have any kids.
644 vdev
->v_nchildren
= nkids
;
645 for (i
= 0; i
< nkids
; i
++) {
646 rc
= vdev_init_from_nvlist(kids
, vdev
, &kid
, is_newer
);
650 STAILQ_INSERT_TAIL(&vdev
->v_children
, kid
,
652 kids
= nvlist_next(kids
);
655 vdev
->v_nchildren
= 0;
664 vdev_set_state(vdev_t
*vdev
)
671 * A mirror or raidz is healthy if all its kids are healthy. A
672 * mirror is degraded if any of its kids is healthy; a raidz
673 * is degraded if at most nparity kids are offline.
675 if (STAILQ_FIRST(&vdev
->v_children
)) {
678 STAILQ_FOREACH(kid
, &vdev
->v_children
, v_childlink
) {
679 if (kid
->v_state
== VDEV_STATE_HEALTHY
)
685 vdev
->v_state
= VDEV_STATE_HEALTHY
;
687 if (vdev
->v_read
== vdev_mirror_read
) {
689 vdev
->v_state
= VDEV_STATE_DEGRADED
;
691 vdev
->v_state
= VDEV_STATE_OFFLINE
;
693 } else if (vdev
->v_read
== vdev_raidz_read
) {
694 if (bad_kids
> vdev
->v_nparity
) {
695 vdev
->v_state
= VDEV_STATE_OFFLINE
;
697 vdev
->v_state
= VDEV_STATE_DEGRADED
;
705 spa_find_by_guid(uint64_t guid
)
709 STAILQ_FOREACH(spa
, &zfs_pools
, spa_link
)
710 if (spa
->spa_guid
== guid
)
717 spa_find_by_name(const char *name
)
721 STAILQ_FOREACH(spa
, &zfs_pools
, spa_link
)
722 if (!strcmp(spa
->spa_name
, name
))
729 spa_get_primary(void)
731 return (STAILQ_FIRST(&zfs_pools
));
735 spa_get_primary_vdev(const spa_t
*spa
)
741 spa
= spa_get_primary();
744 vdev
= STAILQ_FIRST(&spa
->spa_vdevs
);
747 for (kid
= STAILQ_FIRST(&vdev
->v_children
); kid
!= NULL
;
748 kid
= STAILQ_FIRST(&vdev
->v_children
))
754 spa_create(uint64_t guid
, const char *name
)
758 if ((spa
= malloc(sizeof(spa_t
))) == NULL
)
760 memset(spa
, 0, sizeof(spa_t
));
761 if ((spa
->spa_name
= strdup(name
)) == NULL
) {
765 STAILQ_INIT(&spa
->spa_vdevs
);
766 spa
->spa_guid
= guid
;
767 STAILQ_INSERT_TAIL(&zfs_pools
, spa
, spa_link
);
773 state_name(vdev_state_t state
)
775 static const char* names
[] = {
789 pager_printf(const char *fmt
, ...)
795 vsnprintf(line
, sizeof (line
), fmt
, args
);
797 return (pager_output(line
));
800 #define STATUS_FORMAT " %s %s\n"
803 print_state(int indent
, const char *name
, vdev_state_t state
)
809 for (i
= 0; i
< indent
; i
++)
812 return (pager_printf(STATUS_FORMAT
, buf
, state_name(state
)));
816 vdev_status(vdev_t
*vdev
, int indent
)
820 ret
= print_state(indent
, vdev
->v_name
, vdev
->v_state
);
824 STAILQ_FOREACH(kid
, &vdev
->v_children
, v_childlink
) {
825 ret
= vdev_status(kid
, indent
+ 1);
833 spa_status(spa_t
*spa
)
835 static char bootfs
[ZFS_MAXNAMELEN
];
838 int good_kids
, bad_kids
, degraded_kids
, ret
;
841 ret
= pager_printf(" pool: %s\n", spa
->spa_name
);
845 if (zfs_get_root(spa
, &rootid
) == 0 &&
846 zfs_rlookup(spa
, rootid
, bootfs
) == 0) {
847 if (bootfs
[0] == '\0')
848 ret
= pager_printf("bootfs: %s\n", spa
->spa_name
);
850 ret
= pager_printf("bootfs: %s/%s\n", spa
->spa_name
,
855 ret
= pager_printf("config:\n\n");
858 ret
= pager_printf(STATUS_FORMAT
, "NAME", "STATE");
865 STAILQ_FOREACH(vdev
, &spa
->spa_vdevs
, v_childlink
) {
866 if (vdev
->v_state
== VDEV_STATE_HEALTHY
)
868 else if (vdev
->v_state
== VDEV_STATE_DEGRADED
)
874 state
= VDEV_STATE_CLOSED
;
875 if (good_kids
> 0 && (degraded_kids
+ bad_kids
) == 0)
876 state
= VDEV_STATE_HEALTHY
;
877 else if ((good_kids
+ degraded_kids
) > 0)
878 state
= VDEV_STATE_DEGRADED
;
880 ret
= print_state(0, spa
->spa_name
, state
);
883 STAILQ_FOREACH(vdev
, &spa
->spa_vdevs
, v_childlink
) {
884 ret
= vdev_status(vdev
, 1);
895 int first
= 1, ret
= 0;
897 STAILQ_FOREACH(spa
, &zfs_pools
, spa_link
) {
899 ret
= pager_printf("\n");
904 ret
= spa_status(spa
);
912 vdev_label_offset(uint64_t psize
, int l
, uint64_t offset
)
914 uint64_t label_offset
;
916 if (l
< VDEV_LABELS
/ 2)
919 label_offset
= psize
- VDEV_LABELS
* sizeof (vdev_label_t
);
921 return (offset
+ l
* sizeof (vdev_label_t
) + label_offset
);
925 vdev_probe(vdev_phys_read_t
*phys_read
, void *read_priv
, spa_t
**spap
)
928 vdev_phys_t
*vdev_label
= (vdev_phys_t
*) zap_scratch
;
929 vdev_phys_t
*tmp_label
;
931 vdev_t
*vdev
, *top_vdev
, *pool_vdev
;
934 const unsigned char *nvlist
= NULL
;
937 uint64_t best_txg
= 0;
938 uint64_t pool_txg
, pool_guid
;
940 const char *pool_name
;
941 const unsigned char *vdevs
;
942 const unsigned char *features
;
943 int i
, l
, rc
, is_newer
;
945 const struct uberblock
*up
;
948 * Load the vdev label and figure out which
949 * uberblock is most current.
951 memset(&vtmp
, 0, sizeof(vtmp
));
952 vtmp
.v_phys_read
= phys_read
;
953 vtmp
.v_read_priv
= read_priv
;
954 psize
= P2ALIGN(ldi_get_size(read_priv
),
955 (uint64_t)sizeof (vdev_label_t
));
957 /* Test for minimum device size. */
958 if (psize
< SPA_MINDEVSIZE
)
961 tmp_label
= zfs_alloc(sizeof (vdev_phys_t
));
963 for (l
= 0; l
< VDEV_LABELS
; l
++) {
964 off
= vdev_label_offset(psize
, l
,
965 offsetof(vdev_label_t
, vl_vdev_phys
));
968 BP_SET_LSIZE(&bp
, sizeof(vdev_phys_t
));
969 BP_SET_PSIZE(&bp
, sizeof(vdev_phys_t
));
970 BP_SET_CHECKSUM(&bp
, ZIO_CHECKSUM_LABEL
);
971 BP_SET_COMPRESS(&bp
, ZIO_COMPRESS_OFF
);
972 DVA_SET_OFFSET(BP_IDENTITY(&bp
), off
);
973 ZIO_SET_CHECKSUM(&bp
.blk_cksum
, off
, 0, 0, 0);
975 if (vdev_read_phys(&vtmp
, &bp
, tmp_label
, off
, 0))
978 if (tmp_label
->vp_nvlist
[0] != NV_ENCODE_XDR
)
981 nvlist
= (const unsigned char *) tmp_label
->vp_nvlist
+ 4;
982 if (nvlist_find(nvlist
, ZPOOL_CONFIG_POOL_TXG
,
983 DATA_TYPE_UINT64
, NULL
, &pool_txg
) != 0)
986 if (best_txg
<= pool_txg
) {
988 memcpy(vdev_label
, tmp_label
, sizeof (vdev_phys_t
));
992 zfs_free(tmp_label
, sizeof (vdev_phys_t
));
997 if (vdev_label
->vp_nvlist
[0] != NV_ENCODE_XDR
)
1000 nvlist
= (const unsigned char *) vdev_label
->vp_nvlist
+ 4;
1002 if (nvlist_find(nvlist
, ZPOOL_CONFIG_VERSION
, DATA_TYPE_UINT64
,
1007 if (!SPA_VERSION_IS_SUPPORTED(val
)) {
1008 printf("ZFS: unsupported ZFS version %u (should be %u)\n",
1009 (unsigned) val
, (unsigned) SPA_VERSION
);
1013 /* Check ZFS features for read */
1014 if (nvlist_find(nvlist
, ZPOOL_CONFIG_FEATURES_FOR_READ
,
1015 DATA_TYPE_NVLIST
, NULL
, &features
) == 0 &&
1016 nvlist_check_features_for_read(features
) != 0) {
1020 if (nvlist_find(nvlist
, ZPOOL_CONFIG_POOL_STATE
, DATA_TYPE_UINT64
,
1025 if (val
== POOL_STATE_DESTROYED
) {
1026 /* We don't boot from destroyed pools. */
1030 if (nvlist_find(nvlist
, ZPOOL_CONFIG_POOL_TXG
, DATA_TYPE_UINT64
,
1031 NULL
, &pool_txg
) != 0 ||
1032 nvlist_find(nvlist
, ZPOOL_CONFIG_POOL_GUID
, DATA_TYPE_UINT64
,
1033 NULL
, &pool_guid
) != 0 ||
1034 nvlist_find(nvlist
, ZPOOL_CONFIG_POOL_NAME
, DATA_TYPE_STRING
,
1035 NULL
, &pool_name
) != 0) {
1037 * Cache and spare devices end up here - just ignore
1040 /*printf("ZFS: can't find pool details\n");*/
1044 if (nvlist_find(nvlist
, ZPOOL_CONFIG_IS_LOG
, DATA_TYPE_UINT64
,
1045 NULL
, &val
) == 0 && val
!= 0) {
1050 * Create the pool if this is the first time we've seen it.
1052 spa
= spa_find_by_guid(pool_guid
);
1054 spa
= spa_create(pool_guid
, pool_name
);
1058 if (pool_txg
> spa
->spa_txg
) {
1059 spa
->spa_txg
= pool_txg
;
1066 * Get the vdev tree and create our in-core copy of it.
1067 * If we already have a vdev with this guid, this must
1068 * be some kind of alias (overlapping slices, dangerously dedicated
1071 if (nvlist_find(nvlist
, ZPOOL_CONFIG_GUID
, DATA_TYPE_UINT64
,
1072 NULL
, &guid
) != 0) {
1075 vdev
= vdev_find(guid
);
1076 if (vdev
&& vdev
->v_phys_read
) /* Has this vdev already been inited? */
1079 if (nvlist_find(nvlist
, ZPOOL_CONFIG_VDEV_TREE
, DATA_TYPE_NVLIST
,
1084 rc
= vdev_init_from_nvlist(vdevs
, NULL
, &top_vdev
, is_newer
);
1089 * Add the top-level vdev to the pool if it's not already there.
1091 STAILQ_FOREACH(pool_vdev
, &spa
->spa_vdevs
, v_childlink
)
1092 if (top_vdev
== pool_vdev
)
1094 if (!pool_vdev
&& top_vdev
)
1095 STAILQ_INSERT_TAIL(&spa
->spa_vdevs
, top_vdev
, v_childlink
);
1098 * We should already have created an incomplete vdev for this
1099 * vdev. Find it and initialise it with our read proc.
1101 vdev
= vdev_find(guid
);
1103 vdev
->v_phys_read
= phys_read
;
1104 vdev
->v_read_priv
= read_priv
;
1105 vdev
->v_state
= VDEV_STATE_HEALTHY
;
1107 printf("ZFS: inconsistent nvlist contents\n");
1111 /* Record boot vdev for spa. */
1113 spa
->spa_boot_vdev
= vdev
;
1116 * Re-evaluate top-level vdev state.
1118 vdev_set_state(top_vdev
);
1121 * OK, we are happy with the pool so far. Let's find
1122 * the best uberblock and then we can actually access
1123 * the contents of the pool.
1125 upbuf
= zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev
));
1126 up
= (const struct uberblock
*)upbuf
;
1127 for (l
= 0; l
< VDEV_LABELS
; l
++) {
1128 for (i
= 0; i
< VDEV_UBERBLOCK_COUNT(vdev
); i
++) {
1129 off
= vdev_label_offset(psize
, l
,
1130 VDEV_UBERBLOCK_OFFSET(vdev
, i
));
1132 DVA_SET_OFFSET(&bp
.blk_dva
[0], off
);
1133 BP_SET_LSIZE(&bp
, VDEV_UBERBLOCK_SIZE(vdev
));
1134 BP_SET_PSIZE(&bp
, VDEV_UBERBLOCK_SIZE(vdev
));
1135 BP_SET_CHECKSUM(&bp
, ZIO_CHECKSUM_LABEL
);
1136 BP_SET_COMPRESS(&bp
, ZIO_COMPRESS_OFF
);
1137 ZIO_SET_CHECKSUM(&bp
.blk_cksum
, off
, 0, 0, 0);
1139 if (vdev_read_phys(vdev
, &bp
, upbuf
, off
, 0) != 0)
1142 if (up
->ub_magic
!= UBERBLOCK_MAGIC
)
1144 if (up
->ub_txg
< spa
->spa_txg
)
1146 if (up
->ub_txg
> spa
->spa_uberblock
.ub_txg
||
1147 (up
->ub_txg
== spa
->spa_uberblock
.ub_txg
&&
1149 spa
->spa_uberblock
.ub_timestamp
)) {
1150 spa
->spa_uberblock
= *up
;
1154 zfs_free(upbuf
, VDEV_UBERBLOCK_SIZE(vdev
));
1166 for (v
= 0; v
< 32; v
++)
1173 zio_read_gang(const spa_t
*spa
, const blkptr_t
*bp
, void *buf
)
1176 zio_gbh_phys_t zio_gb
;
1180 /* Artificial BP for gang block header. */
1182 BP_SET_PSIZE(&gbh_bp
, SPA_GANGBLOCKSIZE
);
1183 BP_SET_LSIZE(&gbh_bp
, SPA_GANGBLOCKSIZE
);
1184 BP_SET_CHECKSUM(&gbh_bp
, ZIO_CHECKSUM_GANG_HEADER
);
1185 BP_SET_COMPRESS(&gbh_bp
, ZIO_COMPRESS_OFF
);
1186 for (i
= 0; i
< SPA_DVAS_PER_BP
; i
++)
1187 DVA_SET_GANG(&gbh_bp
.blk_dva
[i
], 0);
1189 /* Read gang header block using the artificial BP. */
1190 if (zio_read(spa
, &gbh_bp
, &zio_gb
))
1194 for (i
= 0; i
< SPA_GBH_NBLKPTRS
; i
++) {
1195 blkptr_t
*gbp
= &zio_gb
.zg_blkptr
[i
];
1197 if (BP_IS_HOLE(gbp
))
1199 if (zio_read(spa
, gbp
, pbuf
))
1201 pbuf
+= BP_GET_PSIZE(gbp
);
1204 if (zio_checksum_verify(bp
, buf
))
1210 zio_read(const spa_t
*spa
, const blkptr_t
*bp
, void *buf
)
1212 int cpfunc
= BP_GET_COMPRESS(bp
);
1213 uint64_t align
, size
;
1218 * Process data embedded in block pointer
1220 if (BP_IS_EMBEDDED(bp
)) {
1221 ASSERT(BPE_GET_ETYPE(bp
) == BP_EMBEDDED_TYPE_DATA
);
1223 size
= BPE_GET_PSIZE(bp
);
1224 ASSERT(size
<= BPE_PAYLOAD_SIZE
);
1226 if (cpfunc
!= ZIO_COMPRESS_OFF
)
1227 pbuf
= zfs_alloc(size
);
1231 decode_embedded_bp_compressed(bp
, pbuf
);
1234 if (cpfunc
!= ZIO_COMPRESS_OFF
) {
1235 error
= zio_decompress_data(cpfunc
, pbuf
,
1236 size
, buf
, BP_GET_LSIZE(bp
));
1237 zfs_free(pbuf
, size
);
1240 printf("ZFS: i/o error - unable to decompress block pointer data, error %d\n",
1247 for (i
= 0; i
< SPA_DVAS_PER_BP
; i
++) {
1248 const dva_t
*dva
= &bp
->blk_dva
[i
];
1253 if (!dva
->dva_word
[0] && !dva
->dva_word
[1])
1256 vdevid
= DVA_GET_VDEV(dva
);
1257 offset
= DVA_GET_OFFSET(dva
);
1258 STAILQ_FOREACH(vdev
, &spa
->spa_vdevs
, v_childlink
) {
1259 if (vdev
->v_id
== vdevid
)
1262 if (!vdev
|| !vdev
->v_read
)
1265 size
= BP_GET_PSIZE(bp
);
1266 if (vdev
->v_read
== vdev_raidz_read
) {
1267 align
= 1ULL << vdev
->v_top
->v_ashift
;
1268 if (P2PHASE(size
, align
) != 0)
1269 size
= P2ROUNDUP(size
, align
);
1271 if (size
!= BP_GET_PSIZE(bp
) || cpfunc
!= ZIO_COMPRESS_OFF
)
1272 pbuf
= zfs_alloc(size
);
1276 if (DVA_GET_GANG(dva
))
1277 error
= zio_read_gang(spa
, bp
, pbuf
);
1279 error
= vdev
->v_read(vdev
, bp
, pbuf
, offset
, size
);
1281 if (cpfunc
!= ZIO_COMPRESS_OFF
)
1282 error
= zio_decompress_data(cpfunc
, pbuf
,
1283 BP_GET_PSIZE(bp
), buf
, BP_GET_LSIZE(bp
));
1284 else if (size
!= BP_GET_PSIZE(bp
))
1285 bcopy(pbuf
, buf
, BP_GET_PSIZE(bp
));
1288 zfs_free(pbuf
, size
);
1293 printf("ZFS: i/o error - all block copies unavailable\n");
1298 dnode_read(const spa_t
*spa
, const dnode_phys_t
*dnode
, off_t offset
, void *buf
, size_t buflen
)
1300 int ibshift
= dnode
->dn_indblkshift
- SPA_BLKPTRSHIFT
;
1301 int bsize
= dnode
->dn_datablkszsec
<< SPA_MINBLOCKSHIFT
;
1302 int nlevels
= dnode
->dn_nlevels
;
1305 if (bsize
> SPA_MAXBLOCKSIZE
) {
1306 printf("ZFS: I/O error - blocks larger than %llu are not "
1307 "supported\n", SPA_MAXBLOCKSIZE
);
1312 * Note: bsize may not be a power of two here so we need to do an
1313 * actual divide rather than a bitshift.
1315 while (buflen
> 0) {
1316 uint64_t bn
= offset
/ bsize
;
1317 int boff
= offset
% bsize
;
1319 const blkptr_t
*indbp
;
1322 if (bn
> dnode
->dn_maxblkid
) {
1323 printf("warning: zfs bug: bn %llx > dn_maxblkid %llx\n",
1324 (unsigned long long)bn
,
1325 (unsigned long long)dnode
->dn_maxblkid
);
1327 * zfs bug, will not return error
1332 if (dnode
== dnode_cache_obj
&& bn
== dnode_cache_bn
)
1335 indbp
= dnode
->dn_blkptr
;
1336 for (i
= 0; i
< nlevels
; i
++) {
1338 * Copy the bp from the indirect array so that
1339 * we can re-use the scratch buffer for multi-level
1342 ibn
= bn
>> ((nlevels
- i
- 1) * ibshift
);
1343 ibn
&= ((1 << ibshift
) - 1);
1345 if (BP_IS_HOLE(&bp
)) {
1346 memset(dnode_cache_buf
, 0, bsize
);
1349 rc
= zio_read(spa
, &bp
, dnode_cache_buf
);
1352 indbp
= (const blkptr_t
*) dnode_cache_buf
;
1354 dnode_cache_obj
= dnode
;
1355 dnode_cache_bn
= bn
;
1359 * The buffer contains our data block. Copy what we
1360 * need from it and loop.
1363 if (i
> buflen
) i
= buflen
;
1364 memcpy(buf
, &dnode_cache_buf
[boff
], i
);
1365 buf
= ((char*) buf
) + i
;
1374 * Lookup a value in a microzap directory. Assumes that the zap
1375 * scratch buffer contains the directory contents.
1378 mzap_lookup(const dnode_phys_t
*dnode
, const char *name
, uint64_t *value
)
1380 const mzap_phys_t
*mz
;
1381 const mzap_ent_phys_t
*mze
;
1386 * Microzap objects use exactly one block. Read the whole
1389 size
= dnode
->dn_datablkszsec
* 512;
1391 mz
= (const mzap_phys_t
*) zap_scratch
;
1392 chunks
= size
/ MZAP_ENT_LEN
- 1;
1394 for (i
= 0; i
< chunks
; i
++) {
1395 mze
= &mz
->mz_chunk
[i
];
1396 if (!strcmp(mze
->mze_name
, name
)) {
1397 *value
= mze
->mze_value
;
1406 * Compare a name with a zap leaf entry. Return non-zero if the name
1410 fzap_name_equal(const zap_leaf_t
*zl
, const zap_leaf_chunk_t
*zc
, const char *name
)
1413 const zap_leaf_chunk_t
*nc
;
1416 namelen
= zc
->l_entry
.le_name_numints
;
1418 nc
= &ZAP_LEAF_CHUNK(zl
, zc
->l_entry
.le_name_chunk
);
1420 while (namelen
> 0) {
1423 if (len
> ZAP_LEAF_ARRAY_BYTES
)
1424 len
= ZAP_LEAF_ARRAY_BYTES
;
1425 if (memcmp(p
, nc
->l_array
.la_array
, len
))
1429 nc
= &ZAP_LEAF_CHUNK(zl
, nc
->l_array
.la_next
);
1436 * Extract a uint64_t value from a zap leaf entry.
1439 fzap_leaf_value(const zap_leaf_t
*zl
, const zap_leaf_chunk_t
*zc
)
1441 const zap_leaf_chunk_t
*vc
;
1446 vc
= &ZAP_LEAF_CHUNK(zl
, zc
->l_entry
.le_value_chunk
);
1447 for (i
= 0, value
= 0, p
= vc
->l_array
.la_array
; i
< 8; i
++) {
1448 value
= (value
<< 8) | p
[i
];
1455 * Lookup a value in a fatzap directory. Assumes that the zap scratch
1456 * buffer contains the directory header.
1459 fzap_lookup(const spa_t
*spa
, const dnode_phys_t
*dnode
, const char *name
, uint64_t *value
)
1461 int bsize
= dnode
->dn_datablkszsec
<< SPA_MINBLOCKSHIFT
;
1462 zap_phys_t zh
= *(zap_phys_t
*) zap_scratch
;
1468 if (zh
.zap_magic
!= ZAP_MAGIC
)
1471 z
.zap_block_shift
= ilog2(bsize
);
1472 z
.zap_phys
= (zap_phys_t
*) zap_scratch
;
1475 * Figure out where the pointer table is and read it in if necessary.
1477 if (zh
.zap_ptrtbl
.zt_blk
) {
1478 rc
= dnode_read(spa
, dnode
, zh
.zap_ptrtbl
.zt_blk
* bsize
,
1479 zap_scratch
, bsize
);
1482 ptrtbl
= (uint64_t *) zap_scratch
;
1484 ptrtbl
= &ZAP_EMBEDDED_PTRTBL_ENT(&z
, 0);
1487 hash
= zap_hash(zh
.zap_salt
, name
);
1490 zl
.l_bs
= z
.zap_block_shift
;
1492 off_t off
= ptrtbl
[hash
>> (64 - zh
.zap_ptrtbl
.zt_shift
)] << zl
.l_bs
;
1493 zap_leaf_chunk_t
*zc
;
1495 rc
= dnode_read(spa
, dnode
, off
, zap_scratch
, bsize
);
1499 zl
.l_phys
= (zap_leaf_phys_t
*) zap_scratch
;
1502 * Make sure this chunk matches our hash.
1504 if (zl
.l_phys
->l_hdr
.lh_prefix_len
> 0
1505 && zl
.l_phys
->l_hdr
.lh_prefix
1506 != hash
>> (64 - zl
.l_phys
->l_hdr
.lh_prefix_len
))
1510 * Hash within the chunk to find our entry.
1512 int shift
= (64 - ZAP_LEAF_HASH_SHIFT(&zl
) - zl
.l_phys
->l_hdr
.lh_prefix_len
);
1513 int h
= (hash
>> shift
) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl
)) - 1);
1514 h
= zl
.l_phys
->l_hash
[h
];
1517 zc
= &ZAP_LEAF_CHUNK(&zl
, h
);
1518 while (zc
->l_entry
.le_hash
!= hash
) {
1519 if (zc
->l_entry
.le_next
== 0xffff) {
1523 zc
= &ZAP_LEAF_CHUNK(&zl
, zc
->l_entry
.le_next
);
1525 if (fzap_name_equal(&zl
, zc
, name
)) {
1526 if (zc
->l_entry
.le_value_intlen
* zc
->l_entry
.le_value_numints
> 8)
1528 *value
= fzap_leaf_value(&zl
, zc
);
1536 * Lookup a name in a zap object and return its value as a uint64_t.
1539 zap_lookup(const spa_t
*spa
, const dnode_phys_t
*dnode
, const char *name
, uint64_t *value
)
1543 size_t size
= dnode
->dn_datablkszsec
<< SPA_MINBLOCKSHIFT
;
1545 rc
= dnode_read(spa
, dnode
, 0, zap_scratch
, size
);
1549 zap_type
= *(uint64_t *) zap_scratch
;
1550 if (zap_type
== ZBT_MICRO
)
1551 return mzap_lookup(dnode
, name
, value
);
1552 else if (zap_type
== ZBT_HEADER
)
1553 return fzap_lookup(spa
, dnode
, name
, value
);
1554 printf("ZFS: invalid zap_type=%d\n", (int)zap_type
);
1559 * List a microzap directory. Assumes that the zap scratch buffer contains
1560 * the directory contents.
1563 mzap_list(const dnode_phys_t
*dnode
, int (*callback
)(const char *, uint64_t))
1565 const mzap_phys_t
*mz
;
1566 const mzap_ent_phys_t
*mze
;
1571 * Microzap objects use exactly one block. Read the whole
1574 size
= dnode
->dn_datablkszsec
* 512;
1575 mz
= (const mzap_phys_t
*) zap_scratch
;
1576 chunks
= size
/ MZAP_ENT_LEN
- 1;
1578 for (i
= 0; i
< chunks
; i
++) {
1579 mze
= &mz
->mz_chunk
[i
];
1580 if (mze
->mze_name
[0]) {
1581 rc
= callback(mze
->mze_name
, mze
->mze_value
);
1591 * List a fatzap directory. Assumes that the zap scratch buffer contains
1592 * the directory header.
1595 fzap_list(const spa_t
*spa
, const dnode_phys_t
*dnode
, int (*callback
)(const char *, uint64_t))
1597 int bsize
= dnode
->dn_datablkszsec
<< SPA_MINBLOCKSHIFT
;
1598 zap_phys_t zh
= *(zap_phys_t
*) zap_scratch
;
1602 if (zh
.zap_magic
!= ZAP_MAGIC
)
1605 z
.zap_block_shift
= ilog2(bsize
);
1606 z
.zap_phys
= (zap_phys_t
*) zap_scratch
;
1609 * This assumes that the leaf blocks start at block 1. The
1610 * documentation isn't exactly clear on this.
1613 zl
.l_bs
= z
.zap_block_shift
;
1614 for (i
= 0; i
< zh
.zap_num_leafs
; i
++) {
1615 off_t off
= (i
+ 1) << zl
.l_bs
;
1619 if (dnode_read(spa
, dnode
, off
, zap_scratch
, bsize
))
1622 zl
.l_phys
= (zap_leaf_phys_t
*) zap_scratch
;
1624 for (j
= 0; j
< ZAP_LEAF_NUMCHUNKS(&zl
); j
++) {
1625 zap_leaf_chunk_t
*zc
, *nc
;
1628 zc
= &ZAP_LEAF_CHUNK(&zl
, j
);
1629 if (zc
->l_entry
.le_type
!= ZAP_CHUNK_ENTRY
)
1631 namelen
= zc
->l_entry
.le_name_numints
;
1632 if (namelen
> sizeof(name
))
1633 namelen
= sizeof(name
);
1636 * Paste the name back together.
1638 nc
= &ZAP_LEAF_CHUNK(&zl
, zc
->l_entry
.le_name_chunk
);
1640 while (namelen
> 0) {
1643 if (len
> ZAP_LEAF_ARRAY_BYTES
)
1644 len
= ZAP_LEAF_ARRAY_BYTES
;
1645 memcpy(p
, nc
->l_array
.la_array
, len
);
1648 nc
= &ZAP_LEAF_CHUNK(&zl
, nc
->l_array
.la_next
);
1652 * Assume the first eight bytes of the value are
1655 value
= fzap_leaf_value(&zl
, zc
);
1657 //printf("%s 0x%jx\n", name, (uintmax_t)value);
1658 rc
= callback((const char *)name
, value
);
1667 static int zfs_printf(const char *name
, uint64_t value __unused
)
1670 printf("%s\n", name
);
1676 * List a zap directory.
1679 zap_list(const spa_t
*spa
, const dnode_phys_t
*dnode
)
1682 size_t size
= dnode
->dn_datablkszsec
* 512;
1684 if (dnode_read(spa
, dnode
, 0, zap_scratch
, size
))
1687 zap_type
= *(uint64_t *) zap_scratch
;
1688 if (zap_type
== ZBT_MICRO
)
1689 return mzap_list(dnode
, zfs_printf
);
1691 return fzap_list(spa
, dnode
, zfs_printf
);
1695 objset_get_dnode(const spa_t
*spa
, const objset_phys_t
*os
, uint64_t objnum
, dnode_phys_t
*dnode
)
1699 offset
= objnum
* sizeof(dnode_phys_t
);
1700 return dnode_read(spa
, &os
->os_meta_dnode
, offset
,
1701 dnode
, sizeof(dnode_phys_t
));
1705 mzap_rlookup(const spa_t
*spa
, const dnode_phys_t
*dnode
, char *name
, uint64_t value
)
1707 const mzap_phys_t
*mz
;
1708 const mzap_ent_phys_t
*mze
;
1713 * Microzap objects use exactly one block. Read the whole
1716 size
= dnode
->dn_datablkszsec
* 512;
1718 mz
= (const mzap_phys_t
*) zap_scratch
;
1719 chunks
= size
/ MZAP_ENT_LEN
- 1;
1721 for (i
= 0; i
< chunks
; i
++) {
1722 mze
= &mz
->mz_chunk
[i
];
1723 if (value
== mze
->mze_value
) {
1724 strcpy(name
, mze
->mze_name
);
1733 fzap_name_copy(const zap_leaf_t
*zl
, const zap_leaf_chunk_t
*zc
, char *name
)
1736 const zap_leaf_chunk_t
*nc
;
1739 namelen
= zc
->l_entry
.le_name_numints
;
1741 nc
= &ZAP_LEAF_CHUNK(zl
, zc
->l_entry
.le_name_chunk
);
1743 while (namelen
> 0) {
1746 if (len
> ZAP_LEAF_ARRAY_BYTES
)
1747 len
= ZAP_LEAF_ARRAY_BYTES
;
1748 memcpy(p
, nc
->l_array
.la_array
, len
);
1751 nc
= &ZAP_LEAF_CHUNK(zl
, nc
->l_array
.la_next
);
1758 fzap_rlookup(const spa_t
*spa
, const dnode_phys_t
*dnode
, char *name
, uint64_t value
)
1760 int bsize
= dnode
->dn_datablkszsec
<< SPA_MINBLOCKSHIFT
;
1761 zap_phys_t zh
= *(zap_phys_t
*) zap_scratch
;
1765 if (zh
.zap_magic
!= ZAP_MAGIC
)
1768 z
.zap_block_shift
= ilog2(bsize
);
1769 z
.zap_phys
= (zap_phys_t
*) zap_scratch
;
1772 * This assumes that the leaf blocks start at block 1. The
1773 * documentation isn't exactly clear on this.
1776 zl
.l_bs
= z
.zap_block_shift
;
1777 for (i
= 0; i
< zh
.zap_num_leafs
; i
++) {
1778 off_t off
= (i
+ 1) << zl
.l_bs
;
1780 if (dnode_read(spa
, dnode
, off
, zap_scratch
, bsize
))
1783 zl
.l_phys
= (zap_leaf_phys_t
*) zap_scratch
;
1785 for (j
= 0; j
< ZAP_LEAF_NUMCHUNKS(&zl
); j
++) {
1786 zap_leaf_chunk_t
*zc
;
1788 zc
= &ZAP_LEAF_CHUNK(&zl
, j
);
1789 if (zc
->l_entry
.le_type
!= ZAP_CHUNK_ENTRY
)
1791 if (zc
->l_entry
.le_value_intlen
!= 8 ||
1792 zc
->l_entry
.le_value_numints
!= 1)
1795 if (fzap_leaf_value(&zl
, zc
) == value
) {
1796 fzap_name_copy(&zl
, zc
, name
);
1806 zap_rlookup(const spa_t
*spa
, const dnode_phys_t
*dnode
, char *name
, uint64_t value
)
1810 size_t size
= dnode
->dn_datablkszsec
* 512;
1812 rc
= dnode_read(spa
, dnode
, 0, zap_scratch
, size
);
1816 zap_type
= *(uint64_t *) zap_scratch
;
1817 if (zap_type
== ZBT_MICRO
)
1818 return mzap_rlookup(spa
, dnode
, name
, value
);
1820 return fzap_rlookup(spa
, dnode
, name
, value
);
1824 zfs_rlookup(const spa_t
*spa
, uint64_t objnum
, char *result
)
1827 char component
[256];
1828 uint64_t dir_obj
, parent_obj
, child_dir_zapobj
;
1829 dnode_phys_t child_dir_zap
, dataset
, dir
, parent
;
1831 dsl_dataset_phys_t
*ds
;
1835 p
= &name
[sizeof(name
) - 1];
1838 if (objset_get_dnode(spa
, &spa
->spa_mos
, objnum
, &dataset
)) {
1839 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum
);
1842 ds
= (dsl_dataset_phys_t
*)&dataset
.dn_bonus
;
1843 dir_obj
= ds
->ds_dir_obj
;
1846 if (objset_get_dnode(spa
, &spa
->spa_mos
, dir_obj
, &dir
) != 0)
1848 dd
= (dsl_dir_phys_t
*)&dir
.dn_bonus
;
1850 /* Actual loop condition. */
1851 parent_obj
= dd
->dd_parent_obj
;
1852 if (parent_obj
== 0)
1855 if (objset_get_dnode(spa
, &spa
->spa_mos
, parent_obj
, &parent
) != 0)
1857 dd
= (dsl_dir_phys_t
*)&parent
.dn_bonus
;
1858 child_dir_zapobj
= dd
->dd_child_dir_zapobj
;
1859 if (objset_get_dnode(spa
, &spa
->spa_mos
, child_dir_zapobj
, &child_dir_zap
) != 0)
1861 if (zap_rlookup(spa
, &child_dir_zap
, component
, dir_obj
) != 0)
1864 len
= strlen(component
);
1866 memcpy(p
, component
, len
);
1870 /* Actual loop iteration. */
1871 dir_obj
= parent_obj
;
1882 zfs_lookup_dataset(const spa_t
*spa
, const char *name
, uint64_t *objnum
)
1885 uint64_t dir_obj
, child_dir_zapobj
;
1886 dnode_phys_t child_dir_zap
, dir
;
1890 if (objset_get_dnode(spa
, &spa
->spa_mos
, DMU_POOL_DIRECTORY_OBJECT
, &dir
))
1892 if (zap_lookup(spa
, &dir
, DMU_POOL_ROOT_DATASET
, &dir_obj
))
1897 if (objset_get_dnode(spa
, &spa
->spa_mos
, dir_obj
, &dir
))
1899 dd
= (dsl_dir_phys_t
*)&dir
.dn_bonus
;
1903 /* Actual loop condition #1. */
1909 memcpy(element
, p
, q
- p
);
1910 element
[q
- p
] = '\0';
1917 child_dir_zapobj
= dd
->dd_child_dir_zapobj
;
1918 if (objset_get_dnode(spa
, &spa
->spa_mos
, child_dir_zapobj
, &child_dir_zap
) != 0)
1921 /* Actual loop condition #2. */
1922 if (zap_lookup(spa
, &child_dir_zap
, element
, &dir_obj
) != 0)
1926 *objnum
= dd
->dd_head_dataset_obj
;
1930 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
1932 zfs_list_dataset(const spa_t
*spa
, uint64_t objnum
/*, int pos, char *entry*/)
1934 uint64_t dir_obj
, child_dir_zapobj
;
1935 dnode_phys_t child_dir_zap
, dir
, dataset
;
1936 dsl_dataset_phys_t
*ds
;
1939 if (objset_get_dnode(spa
, &spa
->spa_mos
, objnum
, &dataset
)) {
1940 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum
);
1943 ds
= (dsl_dataset_phys_t
*) &dataset
.dn_bonus
;
1944 dir_obj
= ds
->ds_dir_obj
;
1946 if (objset_get_dnode(spa
, &spa
->spa_mos
, dir_obj
, &dir
)) {
1947 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj
);
1950 dd
= (dsl_dir_phys_t
*)&dir
.dn_bonus
;
1952 child_dir_zapobj
= dd
->dd_child_dir_zapobj
;
1953 if (objset_get_dnode(spa
, &spa
->spa_mos
, child_dir_zapobj
, &child_dir_zap
) != 0) {
1954 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj
);
1958 return (zap_list(spa
, &child_dir_zap
) != 0);
1962 zfs_callback_dataset(const spa_t
*spa
, uint64_t objnum
, int (*callback
)(const char *, uint64_t))
1964 uint64_t dir_obj
, child_dir_zapobj
, zap_type
;
1965 dnode_phys_t child_dir_zap
, dir
, dataset
;
1966 dsl_dataset_phys_t
*ds
;
1970 err
= objset_get_dnode(spa
, &spa
->spa_mos
, objnum
, &dataset
);
1972 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum
);
1975 ds
= (dsl_dataset_phys_t
*) &dataset
.dn_bonus
;
1976 dir_obj
= ds
->ds_dir_obj
;
1978 err
= objset_get_dnode(spa
, &spa
->spa_mos
, dir_obj
, &dir
);
1980 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj
);
1983 dd
= (dsl_dir_phys_t
*)&dir
.dn_bonus
;
1985 child_dir_zapobj
= dd
->dd_child_dir_zapobj
;
1986 err
= objset_get_dnode(spa
, &spa
->spa_mos
, child_dir_zapobj
, &child_dir_zap
);
1988 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj
);
1992 err
= dnode_read(spa
, &child_dir_zap
, 0, zap_scratch
, child_dir_zap
.dn_datablkszsec
* 512);
1996 zap_type
= *(uint64_t *) zap_scratch
;
1997 if (zap_type
== ZBT_MICRO
)
1998 return mzap_list(&child_dir_zap
, callback
);
2000 return fzap_list(spa
, &child_dir_zap
, callback
);
2004 * Find the object set given the object number of its dataset object
2005 * and return its details in *objset
2008 zfs_mount_dataset(const spa_t
*spa
, uint64_t objnum
, objset_phys_t
*objset
)
2010 dnode_phys_t dataset
;
2011 dsl_dataset_phys_t
*ds
;
2013 if (objset_get_dnode(spa
, &spa
->spa_mos
, objnum
, &dataset
)) {
2014 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum
);
2018 ds
= (dsl_dataset_phys_t
*) &dataset
.dn_bonus
;
2019 if (zio_read(spa
, &ds
->ds_bp
, objset
)) {
2020 printf("ZFS: can't read object set for dataset %ju\n",
2029 * Find the object set pointed to by the BOOTFS property or the root
2030 * dataset if there is none and return its details in *objset
2033 zfs_get_root(const spa_t
*spa
, uint64_t *objid
)
2035 dnode_phys_t dir
, propdir
;
2036 uint64_t props
, bootfs
, root
;
2041 * Start with the MOS directory object.
2043 if (objset_get_dnode(spa
, &spa
->spa_mos
, DMU_POOL_DIRECTORY_OBJECT
, &dir
)) {
2044 printf("ZFS: can't read MOS object directory\n");
2049 * Lookup the pool_props and see if we can find a bootfs.
2051 if (zap_lookup(spa
, &dir
, DMU_POOL_PROPS
, &props
) == 0
2052 && objset_get_dnode(spa
, &spa
->spa_mos
, props
, &propdir
) == 0
2053 && zap_lookup(spa
, &propdir
, "bootfs", &bootfs
) == 0
2060 * Lookup the root dataset directory
2062 if (zap_lookup(spa
, &dir
, DMU_POOL_ROOT_DATASET
, &root
)
2063 || objset_get_dnode(spa
, &spa
->spa_mos
, root
, &dir
)) {
2064 printf("ZFS: can't find root dsl_dir\n");
2069 * Use the information from the dataset directory's bonus buffer
2070 * to find the dataset object and from that the object set itself.
2072 dsl_dir_phys_t
*dd
= (dsl_dir_phys_t
*) &dir
.dn_bonus
;
2073 *objid
= dd
->dd_head_dataset_obj
;
2078 zfs_mount(const spa_t
*spa
, uint64_t rootobj
, struct zfsmount
*mnt
)
2084 * Find the root object set if not explicitly provided
2086 if (rootobj
== 0 && zfs_get_root(spa
, &rootobj
)) {
2087 printf("ZFS: can't find root filesystem\n");
2091 if (zfs_mount_dataset(spa
, rootobj
, &mnt
->objset
)) {
2092 printf("ZFS: can't open root filesystem\n");
2096 mnt
->rootobj
= rootobj
;
2102 * callback function for feature name checks.
2105 check_feature(const char *name
, uint64_t value
)
2111 if (name
[0] == '\0')
2114 for (i
= 0; features_for_read
[i
] != NULL
; i
++) {
2115 if (strcmp(name
, features_for_read
[i
]) == 0)
2118 printf("ZFS: unsupported feature: %s\n", name
);
2123 * Checks whether the MOS features that are active are supported.
2126 check_mos_features(const spa_t
*spa
)
2129 uint64_t objnum
, zap_type
;
2133 if ((rc
= objset_get_dnode(spa
, &spa
->spa_mos
, DMU_OT_OBJECT_DIRECTORY
,
2136 if ((rc
= zap_lookup(spa
, &dir
, DMU_POOL_FEATURES_FOR_READ
,
2139 * It is older pool without features. As we have already
2140 * tested the label, just return without raising the error.
2147 if ((rc
= objset_get_dnode(spa
, &spa
->spa_mos
, objnum
, &dir
)) != 0)
2150 if (dir
.dn_type
!= DMU_OTN_ZAP_METADATA
)
2153 size
= dir
.dn_datablkszsec
* 512;
2154 if (dnode_read(spa
, &dir
, 0, zap_scratch
, size
))
2157 zap_type
= *(uint64_t *) zap_scratch
;
2158 if (zap_type
== ZBT_MICRO
)
2159 rc
= mzap_list(&dir
, check_feature
);
2161 rc
= fzap_list(spa
, &dir
, check_feature
);
2167 zfs_spa_init(spa_t
*spa
)
2171 if (zio_read(spa
, &spa
->spa_uberblock
.ub_rootbp
, &spa
->spa_mos
)) {
2172 printf("ZFS: can't read MOS of pool %s\n", spa
->spa_name
);
2175 if (spa
->spa_mos
.os_type
!= DMU_OST_META
) {
2176 printf("ZFS: corrupted MOS of pool %s\n", spa
->spa_name
);
2180 rc
= check_mos_features(spa
);
2182 printf("ZFS: pool %s is not supported\n", spa
->spa_name
);
2189 zfs_dnode_stat(const spa_t
*spa
, dnode_phys_t
*dn
, struct stat
*sb
)
2192 if (dn
->dn_bonustype
!= DMU_OT_SA
) {
2193 znode_phys_t
*zp
= (znode_phys_t
*)dn
->dn_bonus
;
2195 sb
->st_mode
= zp
->zp_mode
;
2196 sb
->st_uid
= zp
->zp_uid
;
2197 sb
->st_gid
= zp
->zp_gid
;
2198 sb
->st_size
= zp
->zp_size
;
2200 sa_hdr_phys_t
*sahdrp
;
2205 if (dn
->dn_bonuslen
!= 0)
2206 sahdrp
= (sa_hdr_phys_t
*)DN_BONUS(dn
);
2208 if ((dn
->dn_flags
& DNODE_FLAG_SPILL_BLKPTR
) != 0) {
2209 blkptr_t
*bp
= DN_SPILL_BLKPTR(dn
);
2212 size
= BP_GET_LSIZE(bp
);
2213 buf
= zfs_alloc(size
);
2214 error
= zio_read(spa
, bp
, buf
);
2216 zfs_free(buf
, size
);
2224 hdrsize
= SA_HDR_SIZE(sahdrp
);
2225 sb
->st_mode
= *(uint64_t *)((char *)sahdrp
+ hdrsize
+
2227 sb
->st_uid
= *(uint64_t *)((char *)sahdrp
+ hdrsize
+
2229 sb
->st_gid
= *(uint64_t *)((char *)sahdrp
+ hdrsize
+
2231 sb
->st_size
= *(uint64_t *)((char *)sahdrp
+ hdrsize
+
2234 zfs_free(buf
, size
);
2241 zfs_dnode_readlink(const spa_t
*spa
, dnode_phys_t
*dn
, char *path
, size_t psize
)
2245 if (dn
->dn_bonustype
== DMU_OT_SA
) {
2246 sa_hdr_phys_t
*sahdrp
= NULL
;
2252 if (dn
->dn_bonuslen
!= 0)
2253 sahdrp
= (sa_hdr_phys_t
*)DN_BONUS(dn
);
2257 if ((dn
->dn_flags
& DNODE_FLAG_SPILL_BLKPTR
) == 0)
2259 bp
= DN_SPILL_BLKPTR(dn
);
2261 size
= BP_GET_LSIZE(bp
);
2262 buf
= zfs_alloc(size
);
2263 rc
= zio_read(spa
, bp
, buf
);
2265 zfs_free(buf
, size
);
2270 hdrsize
= SA_HDR_SIZE(sahdrp
);
2271 p
= (char *)((uintptr_t)sahdrp
+ hdrsize
+ SA_SYMLINK_OFFSET
);
2272 memcpy(path
, p
, psize
);
2274 zfs_free(buf
, size
);
2278 * Second test is purely to silence bogus compiler
2279 * warning about accessing past the end of dn_bonus.
2281 if (psize
+ sizeof(znode_phys_t
) <= dn
->dn_bonuslen
&&
2282 sizeof(znode_phys_t
) <= sizeof(dn
->dn_bonus
)) {
2283 memcpy(path
, &dn
->dn_bonus
[sizeof(znode_phys_t
)], psize
);
2285 rc
= dnode_read(spa
, dn
, 0, path
, psize
);
2292 STAILQ_ENTRY(obj_list
) entry
;
2296 * Lookup a file and return its dnode.
2299 zfs_lookup(const struct zfsmount
*mnt
, const char *upath
, dnode_phys_t
*dnode
)
2308 int symlinks_followed
= 0;
2310 struct obj_list
*entry
, *tentry
;
2311 STAILQ_HEAD(, obj_list
) on_cache
= STAILQ_HEAD_INITIALIZER(on_cache
);
2314 if (mnt
->objset
.os_type
!= DMU_OST_ZFS
) {
2315 printf("ZFS: unexpected object set type %ju\n",
2316 (uintmax_t)mnt
->objset
.os_type
);
2320 if ((entry
= malloc(sizeof(struct obj_list
))) == NULL
)
2324 * Get the root directory dnode.
2326 rc
= objset_get_dnode(spa
, &mnt
->objset
, MASTER_NODE_OBJ
, &dn
);
2332 rc
= zap_lookup(spa
, &dn
, ZFS_ROOT_OBJ
, &objnum
);
2337 entry
->objnum
= objnum
;
2338 STAILQ_INSERT_HEAD(&on_cache
, entry
, entry
);
2340 rc
= objset_get_dnode(spa
, &mnt
->objset
, objnum
, &dn
);
2346 rc
= objset_get_dnode(spa
, &mnt
->objset
, objnum
, &dn
);
2355 while (*q
!= '\0' && *q
!= '/')
2359 if (p
+ 1 == q
&& p
[0] == '.') {
2364 if (p
+ 2 == q
&& p
[0] == '.' && p
[1] == '.') {
2366 if (STAILQ_FIRST(&on_cache
) ==
2367 STAILQ_LAST(&on_cache
, obj_list
, entry
)) {
2371 entry
= STAILQ_FIRST(&on_cache
);
2372 STAILQ_REMOVE_HEAD(&on_cache
, entry
);
2374 objnum
= (STAILQ_FIRST(&on_cache
))->objnum
;
2377 if (q
- p
+ 1 > sizeof(element
)) {
2381 memcpy(element
, p
, q
- p
);
2385 if ((rc
= zfs_dnode_stat(spa
, &dn
, &sb
)) != 0)
2387 if (!S_ISDIR(sb
.st_mode
)) {
2392 rc
= zap_lookup(spa
, &dn
, element
, &objnum
);
2395 objnum
= ZFS_DIRENT_OBJ(objnum
);
2397 if ((entry
= malloc(sizeof(struct obj_list
))) == NULL
) {
2401 entry
->objnum
= objnum
;
2402 STAILQ_INSERT_HEAD(&on_cache
, entry
, entry
);
2403 rc
= objset_get_dnode(spa
, &mnt
->objset
, objnum
, &dn
);
2408 * Check for symlink.
2410 rc
= zfs_dnode_stat(spa
, &dn
, &sb
);
2413 if (S_ISLNK(sb
.st_mode
)) {
2414 if (symlinks_followed
> 10) {
2418 symlinks_followed
++;
2421 * Read the link value and copy the tail of our
2422 * current path onto the end.
2424 if (sb
.st_size
+ strlen(p
) + 1 > sizeof(path
)) {
2428 strcpy(&path
[sb
.st_size
], p
);
2430 rc
= zfs_dnode_readlink(spa
, &dn
, path
, sb
.st_size
);
2435 * Restart with the new path, starting either at
2436 * the root or at the parent depending whether or
2437 * not the link is relative.
2441 while (STAILQ_FIRST(&on_cache
) !=
2442 STAILQ_LAST(&on_cache
, obj_list
, entry
)) {
2443 entry
= STAILQ_FIRST(&on_cache
);
2444 STAILQ_REMOVE_HEAD(&on_cache
, entry
);
2448 entry
= STAILQ_FIRST(&on_cache
);
2449 STAILQ_REMOVE_HEAD(&on_cache
, entry
);
2452 objnum
= (STAILQ_FIRST(&on_cache
))->objnum
;
2458 STAILQ_FOREACH_SAFE(entry
, &on_cache
, entry
, tentry
)