2 * Copyright (c) 2007 Doug Rabson
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 #include <sys/cdefs.h>
30 * Stand-alone ZFS file reader.
34 #include <sys/stdint.h>
47 * List of all vdevs, chained through v_alllink.
49 static vdev_list_t zfs_vdevs
;
52 * List of ZFS features supported for read
54 static const char *features_for_read
[] = {
55 "org.illumos:lz4_compress",
56 "com.delphix:hole_birth",
57 "com.delphix:extensible_dataset",
58 "com.delphix:embedded_data",
59 "org.open-zfs:large_blocks",
63 "org.zfsonlinux:large_dnode",
64 "com.joyent:multi_vdev_crash_dump",
69 * List of all pools, chained through spa_link.
71 static spa_list_t zfs_pools
;
73 static const dnode_phys_t
*dnode_cache_obj
;
74 static uint64_t dnode_cache_bn
;
75 static char *dnode_cache_buf
;
76 static char *zap_scratch
;
77 static char *zfs_temp_buf
, *zfs_temp_end
, *zfs_temp_ptr
;
79 #define TEMP_SIZE (1024 * 1024)
81 static int zio_read(const spa_t
*spa
, const blkptr_t
*bp
, void *buf
);
82 static int zfs_get_root(const spa_t
*spa
, uint64_t *objid
);
83 static int zfs_rlookup(const spa_t
*spa
, uint64_t objnum
, char *result
);
84 static int zap_lookup(const spa_t
*spa
, const dnode_phys_t
*dnode
,
85 const char *name
, uint64_t integer_size
, uint64_t num_integers
,
91 STAILQ_INIT(&zfs_vdevs
);
92 STAILQ_INIT(&zfs_pools
);
94 zfs_temp_buf
= malloc(TEMP_SIZE
);
95 zfs_temp_end
= zfs_temp_buf
+ TEMP_SIZE
;
96 zfs_temp_ptr
= zfs_temp_buf
;
97 dnode_cache_buf
= malloc(SPA_MAXBLOCKSIZE
);
98 zap_scratch
= malloc(SPA_MAXBLOCKSIZE
);
104 zfs_alloc(size_t size
)
108 if (zfs_temp_ptr
+ size
> zfs_temp_end
) {
109 printf("ZFS: out of temporary buffer space\n");
113 zfs_temp_ptr
+= size
;
119 zfs_free(void *ptr
, size_t size
)
122 zfs_temp_ptr
-= size
;
123 if (zfs_temp_ptr
!= ptr
) {
124 printf("ZFS: zfs_alloc()/zfs_free() mismatch\n");
130 xdr_int(const unsigned char **xdr
, int *ip
)
132 *ip
= ((*xdr
)[0] << 24)
141 xdr_u_int(const unsigned char **xdr
, u_int
*ip
)
143 *ip
= ((*xdr
)[0] << 24)
152 xdr_uint64_t(const unsigned char **xdr
, uint64_t *lp
)
158 *lp
= (((uint64_t) hi
) << 32) | lo
;
163 nvlist_find(const unsigned char *nvlist
, const char *name
, int type
,
164 int* elementsp
, void *valuep
)
166 const unsigned char *p
, *pair
;
168 int encoded_size
, decoded_size
;
175 xdr_int(&p
, &encoded_size
);
176 xdr_int(&p
, &decoded_size
);
177 while (encoded_size
&& decoded_size
) {
178 int namelen
, pairtype
, elements
;
179 const char *pairname
;
181 xdr_int(&p
, &namelen
);
182 pairname
= (const char*) p
;
183 p
+= roundup(namelen
, 4);
184 xdr_int(&p
, &pairtype
);
186 if (!memcmp(name
, pairname
, namelen
) && type
== pairtype
) {
187 xdr_int(&p
, &elements
);
189 *elementsp
= elements
;
190 if (type
== DATA_TYPE_UINT64
) {
191 xdr_uint64_t(&p
, (uint64_t *) valuep
);
193 } else if (type
== DATA_TYPE_STRING
) {
196 (*(const char**) valuep
) = (const char*) p
;
198 } else if (type
== DATA_TYPE_NVLIST
199 || type
== DATA_TYPE_NVLIST_ARRAY
) {
200 (*(const unsigned char**) valuep
) =
201 (const unsigned char*) p
;
208 * Not the pair we are looking for, skip to the next one.
210 p
= pair
+ encoded_size
;
214 xdr_int(&p
, &encoded_size
);
215 xdr_int(&p
, &decoded_size
);
222 nvlist_check_features_for_read(const unsigned char *nvlist
)
224 const unsigned char *p
, *pair
;
226 int encoded_size
, decoded_size
;
236 xdr_int(&p
, &encoded_size
);
237 xdr_int(&p
, &decoded_size
);
238 while (encoded_size
&& decoded_size
) {
239 int namelen
, pairtype
;
240 const char *pairname
;
245 xdr_int(&p
, &namelen
);
246 pairname
= (const char*) p
;
247 p
+= roundup(namelen
, 4);
248 xdr_int(&p
, &pairtype
);
250 for (i
= 0; features_for_read
[i
] != NULL
; i
++) {
251 if (!memcmp(pairname
, features_for_read
[i
], namelen
)) {
258 printf("ZFS: unsupported feature: %s\n", pairname
);
262 p
= pair
+ encoded_size
;
265 xdr_int(&p
, &encoded_size
);
266 xdr_int(&p
, &decoded_size
);
273 * Return the next nvlist in an nvlist array.
275 static const unsigned char *
276 nvlist_next(const unsigned char *nvlist
)
278 const unsigned char *p
, *pair
;
280 int encoded_size
, decoded_size
;
287 xdr_int(&p
, &encoded_size
);
288 xdr_int(&p
, &decoded_size
);
289 while (encoded_size
&& decoded_size
) {
290 p
= pair
+ encoded_size
;
293 xdr_int(&p
, &encoded_size
);
294 xdr_int(&p
, &decoded_size
);
302 static const unsigned char *
303 nvlist_print(const unsigned char *nvlist
, unsigned int indent
)
305 static const char* typenames
[] = {
316 "DATA_TYPE_BYTE_ARRAY",
317 "DATA_TYPE_INT16_ARRAY",
318 "DATA_TYPE_UINT16_ARRAY",
319 "DATA_TYPE_INT32_ARRAY",
320 "DATA_TYPE_UINT32_ARRAY",
321 "DATA_TYPE_INT64_ARRAY",
322 "DATA_TYPE_UINT64_ARRAY",
323 "DATA_TYPE_STRING_ARRAY",
326 "DATA_TYPE_NVLIST_ARRAY",
327 "DATA_TYPE_BOOLEAN_VALUE",
330 "DATA_TYPE_BOOLEAN_ARRAY",
331 "DATA_TYPE_INT8_ARRAY",
332 "DATA_TYPE_UINT8_ARRAY"
336 const unsigned char *p
, *pair
;
338 int encoded_size
, decoded_size
;
345 xdr_int(&p
, &encoded_size
);
346 xdr_int(&p
, &decoded_size
);
347 while (encoded_size
&& decoded_size
) {
348 int namelen
, pairtype
, elements
;
349 const char *pairname
;
351 xdr_int(&p
, &namelen
);
352 pairname
= (const char*) p
;
353 p
+= roundup(namelen
, 4);
354 xdr_int(&p
, &pairtype
);
356 for (i
= 0; i
< indent
; i
++)
358 printf("%s %s", typenames
[pairtype
], pairname
);
360 xdr_int(&p
, &elements
);
362 case DATA_TYPE_UINT64
: {
364 xdr_uint64_t(&p
, &val
);
365 printf(" = 0x%jx\n", (uintmax_t)val
);
369 case DATA_TYPE_STRING
: {
372 printf(" = \"%s\"\n", p
);
376 case DATA_TYPE_NVLIST
:
378 nvlist_print(p
, indent
+ 1);
381 case DATA_TYPE_NVLIST_ARRAY
:
382 for (j
= 0; j
< elements
; j
++) {
384 p
= nvlist_print(p
, indent
+ 1);
385 if (j
!= elements
- 1) {
386 for (i
= 0; i
< indent
; i
++)
388 printf("%s %s", typenames
[pairtype
], pairname
);
397 p
= pair
+ encoded_size
;
400 xdr_int(&p
, &encoded_size
);
401 xdr_int(&p
, &decoded_size
);
410 vdev_read_phys(vdev_t
*vdev
, const blkptr_t
*bp
, void *buf
,
411 off_t offset
, size_t size
)
416 if (!vdev
->v_phys_read
)
420 psize
= BP_GET_PSIZE(bp
);
425 /*printf("ZFS: reading %zu bytes at 0x%jx to %p\n", psize, (uintmax_t)offset, buf);*/
426 rc
= vdev
->v_phys_read(vdev
, vdev
->v_read_priv
, offset
, buf
, psize
);
429 if (bp
&& zio_checksum_verify(vdev
->spa
, bp
, buf
))
436 vdev_disk_read(vdev_t
*vdev
, const blkptr_t
*bp
, void *buf
,
437 off_t offset
, size_t bytes
)
440 return (vdev_read_phys(vdev
, bp
, buf
,
441 offset
+ VDEV_LABEL_START_SIZE
, bytes
));
446 vdev_mirror_read(vdev_t
*vdev
, const blkptr_t
*bp
, void *buf
,
447 off_t offset
, size_t bytes
)
453 STAILQ_FOREACH(kid
, &vdev
->v_children
, v_childlink
) {
454 if (kid
->v_state
!= VDEV_STATE_HEALTHY
)
456 rc
= kid
->v_read(kid
, bp
, buf
, offset
, bytes
);
465 vdev_replacing_read(vdev_t
*vdev
, const blkptr_t
*bp
, void *buf
,
466 off_t offset
, size_t bytes
)
471 * Here we should have two kids:
472 * First one which is the one we are replacing and we can trust
473 * only this one to have valid data, but it might not be present.
474 * Second one is that one we are replacing with. It is most likely
475 * healthy, but we can't trust it has needed data, so we won't use it.
477 kid
= STAILQ_FIRST(&vdev
->v_children
);
480 if (kid
->v_state
!= VDEV_STATE_HEALTHY
)
482 return (kid
->v_read(kid
, bp
, buf
, offset
, bytes
));
486 vdev_find(uint64_t guid
)
490 STAILQ_FOREACH(vdev
, &zfs_vdevs
, v_alllink
)
491 if (vdev
->v_guid
== guid
)
498 vdev_create(uint64_t guid
, vdev_read_t
*vdev_read
)
502 vdev
= malloc(sizeof(vdev_t
));
503 memset(vdev
, 0, sizeof(vdev_t
));
504 STAILQ_INIT(&vdev
->v_children
);
506 vdev
->v_state
= VDEV_STATE_OFFLINE
;
507 vdev
->v_read
= vdev_read
;
508 vdev
->v_phys_read
= 0;
509 vdev
->v_read_priv
= 0;
510 STAILQ_INSERT_TAIL(&zfs_vdevs
, vdev
, v_alllink
);
516 vdev_init_from_nvlist(const unsigned char *nvlist
, vdev_t
*pvdev
,
517 vdev_t
**vdevp
, int is_newer
)
520 uint64_t guid
, id
, ashift
, nparity
;
524 const unsigned char *kids
;
525 int nkids
, i
, is_new
;
526 uint64_t is_offline
, is_faulted
, is_degraded
, is_removed
, isnt_present
;
528 if (nvlist_find(nvlist
, ZPOOL_CONFIG_GUID
, DATA_TYPE_UINT64
,
530 nvlist_find(nvlist
, ZPOOL_CONFIG_ID
, DATA_TYPE_UINT64
, NULL
, &id
) ||
531 nvlist_find(nvlist
, ZPOOL_CONFIG_TYPE
, DATA_TYPE_STRING
,
533 printf("ZFS: can't find vdev details\n");
537 if (strcmp(type
, VDEV_TYPE_MIRROR
)
538 && strcmp(type
, VDEV_TYPE_DISK
)
540 && strcmp(type
, VDEV_TYPE_FILE
)
542 && strcmp(type
, VDEV_TYPE_RAIDZ
)
543 && strcmp(type
, VDEV_TYPE_REPLACING
)) {
544 printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
548 is_offline
= is_removed
= is_faulted
= is_degraded
= isnt_present
= 0;
550 nvlist_find(nvlist
, ZPOOL_CONFIG_OFFLINE
, DATA_TYPE_UINT64
, NULL
,
552 nvlist_find(nvlist
, ZPOOL_CONFIG_REMOVED
, DATA_TYPE_UINT64
, NULL
,
554 nvlist_find(nvlist
, ZPOOL_CONFIG_FAULTED
, DATA_TYPE_UINT64
, NULL
,
556 nvlist_find(nvlist
, ZPOOL_CONFIG_DEGRADED
, DATA_TYPE_UINT64
, NULL
,
558 nvlist_find(nvlist
, ZPOOL_CONFIG_NOT_PRESENT
, DATA_TYPE_UINT64
, NULL
,
561 vdev
= vdev_find(guid
);
565 if (!strcmp(type
, VDEV_TYPE_MIRROR
))
566 vdev
= vdev_create(guid
, vdev_mirror_read
);
567 else if (!strcmp(type
, VDEV_TYPE_RAIDZ
))
568 vdev
= vdev_create(guid
, vdev_raidz_read
);
569 else if (!strcmp(type
, VDEV_TYPE_REPLACING
))
570 vdev
= vdev_create(guid
, vdev_replacing_read
);
572 vdev
= vdev_create(guid
, vdev_disk_read
);
575 vdev
->v_top
= pvdev
!= NULL
? pvdev
: vdev
;
576 if (nvlist_find(nvlist
, ZPOOL_CONFIG_ASHIFT
,
577 DATA_TYPE_UINT64
, NULL
, &ashift
) == 0) {
578 vdev
->v_ashift
= ashift
;
582 if (nvlist_find(nvlist
, ZPOOL_CONFIG_NPARITY
,
583 DATA_TYPE_UINT64
, NULL
, &nparity
) == 0) {
584 vdev
->v_nparity
= nparity
;
588 if (nvlist_find(nvlist
, ZPOOL_CONFIG_PATH
,
589 DATA_TYPE_STRING
, NULL
, &path
) == 0) {
590 if (strncmp(path
, "/dev/dsk/", 9) == 0)
592 vdev
->v_name
= strdup(path
);
593 if (nvlist_find(nvlist
, ZPOOL_CONFIG_PHYS_PATH
,
594 DATA_TYPE_STRING
, NULL
, &path
) == 0) {
595 vdev
->v_phys_path
= strdup(path
);
597 vdev
->v_phys_path
= NULL
;
599 if (nvlist_find(nvlist
, ZPOOL_CONFIG_DEVID
,
600 DATA_TYPE_STRING
, NULL
, &path
) == 0) {
601 vdev
->v_devid
= strdup(path
);
603 vdev
->v_devid
= NULL
;
606 if (!strcmp(type
, "raidz")) {
607 if (vdev
->v_nparity
== 1)
608 vdev
->v_name
= "raidz1";
609 else if (vdev
->v_nparity
== 2)
610 vdev
->v_name
= "raidz2";
611 else if (vdev
->v_nparity
== 3)
612 vdev
->v_name
= "raidz3";
614 printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
618 vdev
->v_name
= strdup(type
);
625 if (is_new
|| is_newer
) {
627 * This is either new vdev or we've already seen this vdev,
628 * but from an older vdev label, so let's refresh its state
629 * from the newer label.
632 vdev
->v_state
= VDEV_STATE_OFFLINE
;
634 vdev
->v_state
= VDEV_STATE_REMOVED
;
636 vdev
->v_state
= VDEV_STATE_FAULTED
;
637 else if (is_degraded
)
638 vdev
->v_state
= VDEV_STATE_DEGRADED
;
639 else if (isnt_present
)
640 vdev
->v_state
= VDEV_STATE_CANT_OPEN
;
643 rc
= nvlist_find(nvlist
, ZPOOL_CONFIG_CHILDREN
, DATA_TYPE_NVLIST_ARRAY
,
646 * Its ok if we don't have any kids.
649 vdev
->v_nchildren
= nkids
;
650 for (i
= 0; i
< nkids
; i
++) {
651 rc
= vdev_init_from_nvlist(kids
, vdev
, &kid
, is_newer
);
655 STAILQ_INSERT_TAIL(&vdev
->v_children
, kid
,
657 kids
= nvlist_next(kids
);
660 vdev
->v_nchildren
= 0;
669 vdev_set_state(vdev_t
*vdev
)
676 * A mirror or raidz is healthy if all its kids are healthy. A
677 * mirror is degraded if any of its kids is healthy; a raidz
678 * is degraded if at most nparity kids are offline.
680 if (STAILQ_FIRST(&vdev
->v_children
)) {
683 STAILQ_FOREACH(kid
, &vdev
->v_children
, v_childlink
) {
684 if (kid
->v_state
== VDEV_STATE_HEALTHY
)
690 vdev
->v_state
= VDEV_STATE_HEALTHY
;
692 if (vdev
->v_read
== vdev_mirror_read
) {
694 vdev
->v_state
= VDEV_STATE_DEGRADED
;
696 vdev
->v_state
= VDEV_STATE_OFFLINE
;
698 } else if (vdev
->v_read
== vdev_raidz_read
) {
699 if (bad_kids
> vdev
->v_nparity
) {
700 vdev
->v_state
= VDEV_STATE_OFFLINE
;
702 vdev
->v_state
= VDEV_STATE_DEGRADED
;
710 spa_find_by_guid(uint64_t guid
)
714 STAILQ_FOREACH(spa
, &zfs_pools
, spa_link
)
715 if (spa
->spa_guid
== guid
)
722 spa_find_by_name(const char *name
)
726 STAILQ_FOREACH(spa
, &zfs_pools
, spa_link
)
727 if (!strcmp(spa
->spa_name
, name
))
734 spa_get_primary(void)
736 return (STAILQ_FIRST(&zfs_pools
));
740 spa_get_primary_vdev(const spa_t
*spa
)
746 spa
= spa_get_primary();
749 vdev
= STAILQ_FIRST(&spa
->spa_vdevs
);
752 for (kid
= STAILQ_FIRST(&vdev
->v_children
); kid
!= NULL
;
753 kid
= STAILQ_FIRST(&vdev
->v_children
))
759 spa_create(uint64_t guid
, const char *name
)
763 if ((spa
= malloc(sizeof(spa_t
))) == NULL
)
765 memset(spa
, 0, sizeof(spa_t
));
766 if ((spa
->spa_name
= strdup(name
)) == NULL
) {
770 STAILQ_INIT(&spa
->spa_vdevs
);
771 spa
->spa_guid
= guid
;
772 STAILQ_INSERT_TAIL(&zfs_pools
, spa
, spa_link
);
778 state_name(vdev_state_t state
)
780 static const char* names
[] = {
794 pager_printf(const char *fmt
, ...)
800 vsnprintf(line
, sizeof (line
), fmt
, args
);
802 return (pager_output(line
));
805 #define STATUS_FORMAT " %s %s\n"
808 print_state(int indent
, const char *name
, vdev_state_t state
)
814 for (i
= 0; i
< indent
; i
++)
817 return (pager_printf(STATUS_FORMAT
, buf
, state_name(state
)));
821 vdev_status(vdev_t
*vdev
, int indent
)
825 ret
= print_state(indent
, vdev
->v_name
, vdev
->v_state
);
829 STAILQ_FOREACH(kid
, &vdev
->v_children
, v_childlink
) {
830 ret
= vdev_status(kid
, indent
+ 1);
838 spa_status(spa_t
*spa
)
840 static char bootfs
[ZFS_MAXNAMELEN
];
843 int good_kids
, bad_kids
, degraded_kids
, ret
;
846 ret
= pager_printf(" pool: %s\n", spa
->spa_name
);
850 if (zfs_get_root(spa
, &rootid
) == 0 &&
851 zfs_rlookup(spa
, rootid
, bootfs
) == 0) {
852 if (bootfs
[0] == '\0')
853 ret
= pager_printf("bootfs: %s\n", spa
->spa_name
);
855 ret
= pager_printf("bootfs: %s/%s\n", spa
->spa_name
,
860 ret
= pager_printf("config:\n\n");
863 ret
= pager_printf(STATUS_FORMAT
, "NAME", "STATE");
870 STAILQ_FOREACH(vdev
, &spa
->spa_vdevs
, v_childlink
) {
871 if (vdev
->v_state
== VDEV_STATE_HEALTHY
)
873 else if (vdev
->v_state
== VDEV_STATE_DEGRADED
)
879 state
= VDEV_STATE_CLOSED
;
880 if (good_kids
> 0 && (degraded_kids
+ bad_kids
) == 0)
881 state
= VDEV_STATE_HEALTHY
;
882 else if ((good_kids
+ degraded_kids
) > 0)
883 state
= VDEV_STATE_DEGRADED
;
885 ret
= print_state(0, spa
->spa_name
, state
);
888 STAILQ_FOREACH(vdev
, &spa
->spa_vdevs
, v_childlink
) {
889 ret
= vdev_status(vdev
, 1);
900 int first
= 1, ret
= 0;
902 STAILQ_FOREACH(spa
, &zfs_pools
, spa_link
) {
904 ret
= pager_printf("\n");
909 ret
= spa_status(spa
);
917 vdev_label_offset(uint64_t psize
, int l
, uint64_t offset
)
919 uint64_t label_offset
;
921 if (l
< VDEV_LABELS
/ 2)
924 label_offset
= psize
- VDEV_LABELS
* sizeof (vdev_label_t
);
926 return (offset
+ l
* sizeof (vdev_label_t
) + label_offset
);
930 vdev_probe(vdev_phys_read_t
*phys_read
, void *read_priv
, spa_t
**spap
)
933 vdev_phys_t
*vdev_label
= (vdev_phys_t
*) zap_scratch
;
934 vdev_phys_t
*tmp_label
;
936 vdev_t
*vdev
, *top_vdev
, *pool_vdev
;
939 const unsigned char *nvlist
= NULL
;
942 uint64_t best_txg
= 0;
943 uint64_t pool_txg
, pool_guid
;
945 const char *pool_name
;
946 const unsigned char *vdevs
;
947 const unsigned char *features
;
948 int i
, l
, rc
, is_newer
;
950 const struct uberblock
*up
;
953 * Load the vdev label and figure out which
954 * uberblock is most current.
956 memset(&vtmp
, 0, sizeof(vtmp
));
957 vtmp
.v_phys_read
= phys_read
;
958 vtmp
.v_read_priv
= read_priv
;
959 psize
= P2ALIGN(ldi_get_size(read_priv
),
960 (uint64_t)sizeof (vdev_label_t
));
962 /* Test for minimum device size. */
963 if (psize
< SPA_MINDEVSIZE
)
966 tmp_label
= zfs_alloc(sizeof (vdev_phys_t
));
968 for (l
= 0; l
< VDEV_LABELS
; l
++) {
969 off
= vdev_label_offset(psize
, l
,
970 offsetof(vdev_label_t
, vl_vdev_phys
));
973 BP_SET_LSIZE(&bp
, sizeof(vdev_phys_t
));
974 BP_SET_PSIZE(&bp
, sizeof(vdev_phys_t
));
975 BP_SET_CHECKSUM(&bp
, ZIO_CHECKSUM_LABEL
);
976 BP_SET_COMPRESS(&bp
, ZIO_COMPRESS_OFF
);
977 DVA_SET_OFFSET(BP_IDENTITY(&bp
), off
);
978 ZIO_SET_CHECKSUM(&bp
.blk_cksum
, off
, 0, 0, 0);
980 if (vdev_read_phys(&vtmp
, &bp
, tmp_label
, off
, 0))
983 if (tmp_label
->vp_nvlist
[0] != NV_ENCODE_XDR
)
986 nvlist
= (const unsigned char *) tmp_label
->vp_nvlist
+ 4;
987 if (nvlist_find(nvlist
, ZPOOL_CONFIG_POOL_TXG
,
988 DATA_TYPE_UINT64
, NULL
, &pool_txg
) != 0)
991 if (best_txg
<= pool_txg
) {
993 memcpy(vdev_label
, tmp_label
, sizeof (vdev_phys_t
));
997 zfs_free(tmp_label
, sizeof (vdev_phys_t
));
1002 if (vdev_label
->vp_nvlist
[0] != NV_ENCODE_XDR
)
1005 nvlist
= (const unsigned char *) vdev_label
->vp_nvlist
+ 4;
1007 if (nvlist_find(nvlist
, ZPOOL_CONFIG_VERSION
, DATA_TYPE_UINT64
,
1012 if (!SPA_VERSION_IS_SUPPORTED(val
)) {
1013 printf("ZFS: unsupported ZFS version %u (should be %u)\n",
1014 (unsigned) val
, (unsigned) SPA_VERSION
);
1018 /* Check ZFS features for read */
1019 if (nvlist_find(nvlist
, ZPOOL_CONFIG_FEATURES_FOR_READ
,
1020 DATA_TYPE_NVLIST
, NULL
, &features
) == 0 &&
1021 nvlist_check_features_for_read(features
) != 0) {
1025 if (nvlist_find(nvlist
, ZPOOL_CONFIG_POOL_STATE
, DATA_TYPE_UINT64
,
1030 if (val
== POOL_STATE_DESTROYED
) {
1031 /* We don't boot only from destroyed pools. */
1035 if (nvlist_find(nvlist
, ZPOOL_CONFIG_POOL_TXG
, DATA_TYPE_UINT64
,
1036 NULL
, &pool_txg
) != 0 ||
1037 nvlist_find(nvlist
, ZPOOL_CONFIG_POOL_GUID
, DATA_TYPE_UINT64
,
1038 NULL
, &pool_guid
) != 0 ||
1039 nvlist_find(nvlist
, ZPOOL_CONFIG_POOL_NAME
, DATA_TYPE_STRING
,
1040 NULL
, &pool_name
) != 0) {
1042 * Cache and spare devices end up here - just ignore
1045 /*printf("ZFS: can't find pool details\n");*/
1049 if (nvlist_find(nvlist
, ZPOOL_CONFIG_IS_LOG
, DATA_TYPE_UINT64
,
1050 NULL
, &val
) == 0 && val
!= 0) {
1055 * Create the pool if this is the first time we've seen it.
1057 spa
= spa_find_by_guid(pool_guid
);
1059 spa
= spa_create(pool_guid
, pool_name
);
1063 if (pool_txg
> spa
->spa_txg
) {
1064 spa
->spa_txg
= pool_txg
;
1071 * Get the vdev tree and create our in-core copy of it.
1072 * If we already have a vdev with this guid, this must
1073 * be some kind of alias (overlapping slices, dangerously dedicated
1076 if (nvlist_find(nvlist
, ZPOOL_CONFIG_GUID
, DATA_TYPE_UINT64
,
1077 NULL
, &guid
) != 0) {
1080 vdev
= vdev_find(guid
);
1081 if (vdev
&& vdev
->v_phys_read
) /* Has this vdev already been inited? */
1084 if (nvlist_find(nvlist
, ZPOOL_CONFIG_VDEV_TREE
, DATA_TYPE_NVLIST
,
1089 rc
= vdev_init_from_nvlist(vdevs
, NULL
, &top_vdev
, is_newer
);
1094 * Add the toplevel vdev to the pool if its not already there.
1096 STAILQ_FOREACH(pool_vdev
, &spa
->spa_vdevs
, v_childlink
)
1097 if (top_vdev
== pool_vdev
)
1099 if (!pool_vdev
&& top_vdev
) {
1100 top_vdev
->spa
= spa
;
1101 STAILQ_INSERT_TAIL(&spa
->spa_vdevs
, top_vdev
, v_childlink
);
1105 * We should already have created an incomplete vdev for this
1106 * vdev. Find it and initialise it with our read proc.
1108 vdev
= vdev_find(guid
);
1110 vdev
->v_phys_read
= phys_read
;
1111 vdev
->v_read_priv
= read_priv
;
1112 vdev
->v_state
= VDEV_STATE_HEALTHY
;
1114 printf("ZFS: inconsistent nvlist contents\n");
1118 /* Record boot vdev for spa. */
1120 spa
->spa_boot_vdev
= vdev
;
1123 * Re-evaluate top-level vdev state.
1125 vdev_set_state(top_vdev
);
1128 * Ok, we are happy with the pool so far. Lets find
1129 * the best uberblock and then we can actually access
1130 * the contents of the pool.
1132 upbuf
= zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev
));
1133 up
= (const struct uberblock
*)upbuf
;
1134 for (l
= 0; l
< VDEV_LABELS
; l
++) {
1135 for (i
= 0; i
< VDEV_UBERBLOCK_COUNT(vdev
); i
++) {
1136 off
= vdev_label_offset(psize
, l
,
1137 VDEV_UBERBLOCK_OFFSET(vdev
, i
));
1139 DVA_SET_OFFSET(&bp
.blk_dva
[0], off
);
1140 BP_SET_LSIZE(&bp
, VDEV_UBERBLOCK_SIZE(vdev
));
1141 BP_SET_PSIZE(&bp
, VDEV_UBERBLOCK_SIZE(vdev
));
1142 BP_SET_CHECKSUM(&bp
, ZIO_CHECKSUM_LABEL
);
1143 BP_SET_COMPRESS(&bp
, ZIO_COMPRESS_OFF
);
1144 ZIO_SET_CHECKSUM(&bp
.blk_cksum
, off
, 0, 0, 0);
1146 if (vdev_read_phys(vdev
, &bp
, upbuf
, off
, 0) != 0)
1149 if (up
->ub_magic
!= UBERBLOCK_MAGIC
)
1151 if (up
->ub_txg
< spa
->spa_txg
)
1153 if (up
->ub_txg
> spa
->spa_uberblock
.ub_txg
||
1154 (up
->ub_txg
== spa
->spa_uberblock
.ub_txg
&&
1156 spa
->spa_uberblock
.ub_timestamp
)) {
1157 spa
->spa_uberblock
= *up
;
1161 zfs_free(upbuf
, VDEV_UBERBLOCK_SIZE(vdev
));
1174 for (v
= 0; v
< 32; v
++)
1181 zio_read_gang(const spa_t
*spa
, const blkptr_t
*bp
, void *buf
)
1184 zio_gbh_phys_t zio_gb
;
1188 /* Artificial BP for gang block header. */
1190 BP_SET_PSIZE(&gbh_bp
, SPA_GANGBLOCKSIZE
);
1191 BP_SET_LSIZE(&gbh_bp
, SPA_GANGBLOCKSIZE
);
1192 BP_SET_CHECKSUM(&gbh_bp
, ZIO_CHECKSUM_GANG_HEADER
);
1193 BP_SET_COMPRESS(&gbh_bp
, ZIO_COMPRESS_OFF
);
1194 for (i
= 0; i
< SPA_DVAS_PER_BP
; i
++)
1195 DVA_SET_GANG(&gbh_bp
.blk_dva
[i
], 0);
1197 /* Read gang header block using the artificial BP. */
1198 if (zio_read(spa
, &gbh_bp
, &zio_gb
))
1202 for (i
= 0; i
< SPA_GBH_NBLKPTRS
; i
++) {
1203 blkptr_t
*gbp
= &zio_gb
.zg_blkptr
[i
];
1205 if (BP_IS_HOLE(gbp
))
1207 if (zio_read(spa
, gbp
, pbuf
))
1209 pbuf
+= BP_GET_PSIZE(gbp
);
1212 if (zio_checksum_verify(spa
, bp
, buf
))
1218 zio_read(const spa_t
*spa
, const blkptr_t
*bp
, void *buf
)
1220 int cpfunc
= BP_GET_COMPRESS(bp
);
1221 uint64_t align
, size
;
1226 * Process data embedded in block pointer
1228 if (BP_IS_EMBEDDED(bp
)) {
1229 ASSERT(BPE_GET_ETYPE(bp
) == BP_EMBEDDED_TYPE_DATA
);
1231 size
= BPE_GET_PSIZE(bp
);
1232 ASSERT(size
<= BPE_PAYLOAD_SIZE
);
1234 if (cpfunc
!= ZIO_COMPRESS_OFF
)
1235 pbuf
= zfs_alloc(size
);
1239 decode_embedded_bp_compressed(bp
, pbuf
);
1242 if (cpfunc
!= ZIO_COMPRESS_OFF
) {
1243 error
= zio_decompress_data(cpfunc
, pbuf
,
1244 size
, buf
, BP_GET_LSIZE(bp
));
1245 zfs_free(pbuf
, size
);
1248 printf("ZFS: i/o error - unable to decompress block pointer data, error %d\n",
1255 for (i
= 0; i
< SPA_DVAS_PER_BP
; i
++) {
1256 const dva_t
*dva
= &bp
->blk_dva
[i
];
1261 if (!dva
->dva_word
[0] && !dva
->dva_word
[1])
1264 vdevid
= DVA_GET_VDEV(dva
);
1265 offset
= DVA_GET_OFFSET(dva
);
1266 STAILQ_FOREACH(vdev
, &spa
->spa_vdevs
, v_childlink
) {
1267 if (vdev
->v_id
== vdevid
)
1270 if (!vdev
|| !vdev
->v_read
)
1273 size
= BP_GET_PSIZE(bp
);
1274 if (vdev
->v_read
== vdev_raidz_read
) {
1275 align
= 1ULL << vdev
->v_top
->v_ashift
;
1276 if (P2PHASE(size
, align
) != 0)
1277 size
= P2ROUNDUP(size
, align
);
1279 if (size
!= BP_GET_PSIZE(bp
) || cpfunc
!= ZIO_COMPRESS_OFF
)
1280 pbuf
= zfs_alloc(size
);
1284 if (DVA_GET_GANG(dva
))
1285 error
= zio_read_gang(spa
, bp
, pbuf
);
1287 error
= vdev
->v_read(vdev
, bp
, pbuf
, offset
, size
);
1289 if (cpfunc
!= ZIO_COMPRESS_OFF
)
1290 error
= zio_decompress_data(cpfunc
, pbuf
,
1291 BP_GET_PSIZE(bp
), buf
, BP_GET_LSIZE(bp
));
1292 else if (size
!= BP_GET_PSIZE(bp
))
1293 bcopy(pbuf
, buf
, BP_GET_PSIZE(bp
));
1296 zfs_free(pbuf
, size
);
1301 printf("ZFS: i/o error - all block copies unavailable\n");
1306 dnode_read(const spa_t
*spa
, const dnode_phys_t
*dnode
, off_t offset
, void *buf
, size_t buflen
)
1308 int ibshift
= dnode
->dn_indblkshift
- SPA_BLKPTRSHIFT
;
1309 int bsize
= dnode
->dn_datablkszsec
<< SPA_MINBLOCKSHIFT
;
1310 int nlevels
= dnode
->dn_nlevels
;
1313 if (bsize
> SPA_MAXBLOCKSIZE
) {
1314 printf("ZFS: I/O error - blocks larger than %llu are not "
1315 "supported\n", SPA_MAXBLOCKSIZE
);
1320 * Note: bsize may not be a power of two here so we need to do an
1321 * actual divide rather than a bitshift.
1323 while (buflen
> 0) {
1324 uint64_t bn
= offset
/ bsize
;
1325 int boff
= offset
% bsize
;
1327 const blkptr_t
*indbp
;
1330 if (bn
> dnode
->dn_maxblkid
) {
1331 printf("warning: zfs bug: bn %llx > dn_maxblkid %llx\n",
1332 (unsigned long long)bn
,
1333 (unsigned long long)dnode
->dn_maxblkid
);
1335 * zfs bug, will not return error
1340 if (dnode
== dnode_cache_obj
&& bn
== dnode_cache_bn
)
1343 indbp
= dnode
->dn_blkptr
;
1344 for (i
= 0; i
< nlevels
; i
++) {
1346 * Copy the bp from the indirect array so that
1347 * we can re-use the scratch buffer for multi-level
1350 ibn
= bn
>> ((nlevels
- i
- 1) * ibshift
);
1351 ibn
&= ((1 << ibshift
) - 1);
1353 if (BP_IS_HOLE(&bp
)) {
1354 memset(dnode_cache_buf
, 0, bsize
);
1357 rc
= zio_read(spa
, &bp
, dnode_cache_buf
);
1360 indbp
= (const blkptr_t
*) dnode_cache_buf
;
1362 dnode_cache_obj
= dnode
;
1363 dnode_cache_bn
= bn
;
1367 * The buffer contains our data block. Copy what we
1368 * need from it and loop.
1371 if (i
> buflen
) i
= buflen
;
1372 memcpy(buf
, &dnode_cache_buf
[boff
], i
);
1373 buf
= ((char*) buf
) + i
;
1382 * Lookup a value in a microzap directory. Assumes that the zap
1383 * scratch buffer contains the directory contents.
1386 mzap_lookup(const dnode_phys_t
*dnode
, const char *name
, uint64_t *value
)
1388 const mzap_phys_t
*mz
;
1389 const mzap_ent_phys_t
*mze
;
1394 * Microzap objects use exactly one block. Read the whole
1397 size
= dnode
->dn_datablkszsec
* 512;
1399 mz
= (const mzap_phys_t
*) zap_scratch
;
1400 chunks
= size
/ MZAP_ENT_LEN
- 1;
1402 for (i
= 0; i
< chunks
; i
++) {
1403 mze
= &mz
->mz_chunk
[i
];
1404 if (!strcmp(mze
->mze_name
, name
)) {
1405 *value
= mze
->mze_value
;
1414 * Compare a name with a zap leaf entry. Return non-zero if the name
1418 fzap_name_equal(const zap_leaf_t
*zl
, const zap_leaf_chunk_t
*zc
, const char *name
)
1421 const zap_leaf_chunk_t
*nc
;
1424 namelen
= zc
->l_entry
.le_name_numints
;
1426 nc
= &ZAP_LEAF_CHUNK(zl
, zc
->l_entry
.le_name_chunk
);
1428 while (namelen
> 0) {
1431 if (len
> ZAP_LEAF_ARRAY_BYTES
)
1432 len
= ZAP_LEAF_ARRAY_BYTES
;
1433 if (memcmp(p
, nc
->l_array
.la_array
, len
))
1437 nc
= &ZAP_LEAF_CHUNK(zl
, nc
->l_array
.la_next
);
1444 * Extract a uint64_t value from a zap leaf entry.
1447 fzap_leaf_value(const zap_leaf_t
*zl
, const zap_leaf_chunk_t
*zc
)
1449 const zap_leaf_chunk_t
*vc
;
1454 vc
= &ZAP_LEAF_CHUNK(zl
, zc
->l_entry
.le_value_chunk
);
1455 for (i
= 0, value
= 0, p
= vc
->l_array
.la_array
; i
< 8; i
++) {
1456 value
= (value
<< 8) | p
[i
];
1463 stv(int len
, void *addr
, uint64_t value
)
1467 *(uint8_t *)addr
= value
;
1470 *(uint16_t *)addr
= value
;
1473 *(uint32_t *)addr
= value
;
1476 *(uint64_t *)addr
= value
;
1482 * Extract a array from a zap leaf entry.
1485 fzap_leaf_array(const zap_leaf_t
*zl
, const zap_leaf_chunk_t
*zc
,
1486 uint64_t integer_size
, uint64_t num_integers
, void *buf
)
1488 uint64_t array_int_len
= zc
->l_entry
.le_value_intlen
;
1490 uint64_t *u64
= buf
;
1492 int len
= MIN(zc
->l_entry
.le_value_numints
, num_integers
);
1493 int chunk
= zc
->l_entry
.le_value_chunk
;
1496 if (integer_size
== 8 && len
== 1) {
1497 *u64
= fzap_leaf_value(zl
, zc
);
1502 struct zap_leaf_array
*la
= &ZAP_LEAF_CHUNK(zl
, chunk
).l_array
;
1505 ASSERT3U(chunk
, <, ZAP_LEAF_NUMCHUNKS(zl
));
1506 for (i
= 0; i
< ZAP_LEAF_ARRAY_BYTES
&& len
> 0; i
++) {
1507 value
= (value
<< 8) | la
->la_array
[i
];
1509 if (byten
== array_int_len
) {
1510 stv(integer_size
, p
, value
);
1518 chunk
= la
->la_next
;
1523 fzap_check_size(uint64_t integer_size
, uint64_t num_integers
)
1526 switch (integer_size
) {
1536 if (integer_size
* num_integers
> ZAP_MAXVALUELEN
)
1543 * Lookup a value in a fatzap directory. Assumes that the zap scratch
1544 * buffer contains the directory header.
1547 fzap_lookup(const spa_t
*spa
, const dnode_phys_t
*dnode
, const char *name
,
1548 uint64_t integer_size
, uint64_t num_integers
, void *value
)
1550 int bsize
= dnode
->dn_datablkszsec
<< SPA_MINBLOCKSHIFT
;
1551 zap_phys_t zh
= *(zap_phys_t
*) zap_scratch
;
1557 if (zh
.zap_magic
!= ZAP_MAGIC
)
1560 if ((rc
= fzap_check_size(integer_size
, num_integers
)) != 0)
1563 z
.zap_block_shift
= ilog2(bsize
);
1564 z
.zap_phys
= (zap_phys_t
*) zap_scratch
;
1567 * Figure out where the pointer table is and read it in if necessary.
1569 if (zh
.zap_ptrtbl
.zt_blk
) {
1570 rc
= dnode_read(spa
, dnode
, zh
.zap_ptrtbl
.zt_blk
* bsize
,
1571 zap_scratch
, bsize
);
1574 ptrtbl
= (uint64_t *) zap_scratch
;
1576 ptrtbl
= &ZAP_EMBEDDED_PTRTBL_ENT(&z
, 0);
1579 hash
= zap_hash(zh
.zap_salt
, name
);
1582 zl
.l_bs
= z
.zap_block_shift
;
1584 off_t off
= ptrtbl
[hash
>> (64 - zh
.zap_ptrtbl
.zt_shift
)] << zl
.l_bs
;
1585 zap_leaf_chunk_t
*zc
;
1587 rc
= dnode_read(spa
, dnode
, off
, zap_scratch
, bsize
);
1591 zl
.l_phys
= (zap_leaf_phys_t
*) zap_scratch
;
1594 * Make sure this chunk matches our hash.
1596 if (zl
.l_phys
->l_hdr
.lh_prefix_len
> 0
1597 && zl
.l_phys
->l_hdr
.lh_prefix
1598 != hash
>> (64 - zl
.l_phys
->l_hdr
.lh_prefix_len
))
1602 * Hash within the chunk to find our entry.
1604 int shift
= (64 - ZAP_LEAF_HASH_SHIFT(&zl
) - zl
.l_phys
->l_hdr
.lh_prefix_len
);
1605 int h
= (hash
>> shift
) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl
)) - 1);
1606 h
= zl
.l_phys
->l_hash
[h
];
1609 zc
= &ZAP_LEAF_CHUNK(&zl
, h
);
1610 while (zc
->l_entry
.le_hash
!= hash
) {
1611 if (zc
->l_entry
.le_next
== 0xffff) {
1615 zc
= &ZAP_LEAF_CHUNK(&zl
, zc
->l_entry
.le_next
);
1617 if (fzap_name_equal(&zl
, zc
, name
)) {
1618 if (zc
->l_entry
.le_value_intlen
> integer_size
)
1621 fzap_leaf_array(&zl
, zc
, integer_size
, num_integers
, value
);
1629 * Lookup a name in a zap object and return its value as a uint64_t.
1632 zap_lookup(const spa_t
*spa
, const dnode_phys_t
*dnode
, const char *name
,
1633 uint64_t integer_size
, uint64_t num_integers
, void *value
)
1637 size_t size
= dnode
->dn_datablkszsec
<< SPA_MINBLOCKSHIFT
;
1639 rc
= dnode_read(spa
, dnode
, 0, zap_scratch
, size
);
1643 zap_type
= *(uint64_t *) zap_scratch
;
1644 if (zap_type
== ZBT_MICRO
)
1645 return mzap_lookup(dnode
, name
, value
);
1646 else if (zap_type
== ZBT_HEADER
) {
1647 return fzap_lookup(spa
, dnode
, name
, integer_size
,
1648 num_integers
, value
);
1650 printf("ZFS: invalid zap_type=%d\n", (int)zap_type
);
1655 * List a microzap directory. Assumes that the zap scratch buffer contains
1656 * the directory contents.
1659 mzap_list(const dnode_phys_t
*dnode
, int (*callback
)(const char *, uint64_t))
1661 const mzap_phys_t
*mz
;
1662 const mzap_ent_phys_t
*mze
;
1667 * Microzap objects use exactly one block. Read the whole
1670 size
= dnode
->dn_datablkszsec
* 512;
1671 mz
= (const mzap_phys_t
*) zap_scratch
;
1672 chunks
= size
/ MZAP_ENT_LEN
- 1;
1674 for (i
= 0; i
< chunks
; i
++) {
1675 mze
= &mz
->mz_chunk
[i
];
1676 if (mze
->mze_name
[0]) {
1677 rc
= callback(mze
->mze_name
, mze
->mze_value
);
1687 * List a fatzap directory. Assumes that the zap scratch buffer contains
1688 * the directory header.
1691 fzap_list(const spa_t
*spa
, const dnode_phys_t
*dnode
, int (*callback
)(const char *, uint64_t))
1693 int bsize
= dnode
->dn_datablkszsec
<< SPA_MINBLOCKSHIFT
;
1694 zap_phys_t zh
= *(zap_phys_t
*) zap_scratch
;
1698 if (zh
.zap_magic
!= ZAP_MAGIC
)
1701 z
.zap_block_shift
= ilog2(bsize
);
1702 z
.zap_phys
= (zap_phys_t
*) zap_scratch
;
1705 * This assumes that the leaf blocks start at block 1. The
1706 * documentation isn't exactly clear on this.
1709 zl
.l_bs
= z
.zap_block_shift
;
1710 for (i
= 0; i
< zh
.zap_num_leafs
; i
++) {
1711 off_t off
= (i
+ 1) << zl
.l_bs
;
1715 if (dnode_read(spa
, dnode
, off
, zap_scratch
, bsize
))
1718 zl
.l_phys
= (zap_leaf_phys_t
*) zap_scratch
;
1720 for (j
= 0; j
< ZAP_LEAF_NUMCHUNKS(&zl
); j
++) {
1721 zap_leaf_chunk_t
*zc
, *nc
;
1724 zc
= &ZAP_LEAF_CHUNK(&zl
, j
);
1725 if (zc
->l_entry
.le_type
!= ZAP_CHUNK_ENTRY
)
1727 namelen
= zc
->l_entry
.le_name_numints
;
1728 if (namelen
> sizeof(name
))
1729 namelen
= sizeof(name
);
1732 * Paste the name back together.
1734 nc
= &ZAP_LEAF_CHUNK(&zl
, zc
->l_entry
.le_name_chunk
);
1736 while (namelen
> 0) {
1739 if (len
> ZAP_LEAF_ARRAY_BYTES
)
1740 len
= ZAP_LEAF_ARRAY_BYTES
;
1741 memcpy(p
, nc
->l_array
.la_array
, len
);
1744 nc
= &ZAP_LEAF_CHUNK(&zl
, nc
->l_array
.la_next
);
1748 * Assume the first eight bytes of the value are
1751 value
= fzap_leaf_value(&zl
, zc
);
1753 //printf("%s 0x%jx\n", name, (uintmax_t)value);
1754 rc
= callback((const char *)name
, value
);
1763 static int zfs_printf(const char *name
, uint64_t value __unused
)
1766 printf("%s\n", name
);
1772 * List a zap directory.
1775 zap_list(const spa_t
*spa
, const dnode_phys_t
*dnode
)
1778 size_t size
= dnode
->dn_datablkszsec
* 512;
1780 if (dnode_read(spa
, dnode
, 0, zap_scratch
, size
))
1783 zap_type
= *(uint64_t *) zap_scratch
;
1784 if (zap_type
== ZBT_MICRO
)
1785 return mzap_list(dnode
, zfs_printf
);
1787 return fzap_list(spa
, dnode
, zfs_printf
);
1791 objset_get_dnode(const spa_t
*spa
, const objset_phys_t
*os
, uint64_t objnum
, dnode_phys_t
*dnode
)
1795 offset
= objnum
* sizeof(dnode_phys_t
);
1796 return dnode_read(spa
, &os
->os_meta_dnode
, offset
,
1797 dnode
, sizeof(dnode_phys_t
));
1801 mzap_rlookup(const spa_t
*spa
, const dnode_phys_t
*dnode
, char *name
, uint64_t value
)
1803 const mzap_phys_t
*mz
;
1804 const mzap_ent_phys_t
*mze
;
1809 * Microzap objects use exactly one block. Read the whole
1812 size
= dnode
->dn_datablkszsec
* 512;
1814 mz
= (const mzap_phys_t
*) zap_scratch
;
1815 chunks
= size
/ MZAP_ENT_LEN
- 1;
1817 for (i
= 0; i
< chunks
; i
++) {
1818 mze
= &mz
->mz_chunk
[i
];
1819 if (value
== mze
->mze_value
) {
1820 strcpy(name
, mze
->mze_name
);
1829 fzap_name_copy(const zap_leaf_t
*zl
, const zap_leaf_chunk_t
*zc
, char *name
)
1832 const zap_leaf_chunk_t
*nc
;
1835 namelen
= zc
->l_entry
.le_name_numints
;
1837 nc
= &ZAP_LEAF_CHUNK(zl
, zc
->l_entry
.le_name_chunk
);
1839 while (namelen
> 0) {
1842 if (len
> ZAP_LEAF_ARRAY_BYTES
)
1843 len
= ZAP_LEAF_ARRAY_BYTES
;
1844 memcpy(p
, nc
->l_array
.la_array
, len
);
1847 nc
= &ZAP_LEAF_CHUNK(zl
, nc
->l_array
.la_next
);
1854 fzap_rlookup(const spa_t
*spa
, const dnode_phys_t
*dnode
, char *name
, uint64_t value
)
1856 int bsize
= dnode
->dn_datablkszsec
<< SPA_MINBLOCKSHIFT
;
1857 zap_phys_t zh
= *(zap_phys_t
*) zap_scratch
;
1861 if (zh
.zap_magic
!= ZAP_MAGIC
)
1864 z
.zap_block_shift
= ilog2(bsize
);
1865 z
.zap_phys
= (zap_phys_t
*) zap_scratch
;
1868 * This assumes that the leaf blocks start at block 1. The
1869 * documentation isn't exactly clear on this.
1872 zl
.l_bs
= z
.zap_block_shift
;
1873 for (i
= 0; i
< zh
.zap_num_leafs
; i
++) {
1874 off_t off
= (i
+ 1) << zl
.l_bs
;
1876 if (dnode_read(spa
, dnode
, off
, zap_scratch
, bsize
))
1879 zl
.l_phys
= (zap_leaf_phys_t
*) zap_scratch
;
1881 for (j
= 0; j
< ZAP_LEAF_NUMCHUNKS(&zl
); j
++) {
1882 zap_leaf_chunk_t
*zc
;
1884 zc
= &ZAP_LEAF_CHUNK(&zl
, j
);
1885 if (zc
->l_entry
.le_type
!= ZAP_CHUNK_ENTRY
)
1887 if (zc
->l_entry
.le_value_intlen
!= 8 ||
1888 zc
->l_entry
.le_value_numints
!= 1)
1891 if (fzap_leaf_value(&zl
, zc
) == value
) {
1892 fzap_name_copy(&zl
, zc
, name
);
1902 zap_rlookup(const spa_t
*spa
, const dnode_phys_t
*dnode
, char *name
, uint64_t value
)
1906 size_t size
= dnode
->dn_datablkszsec
* 512;
1908 rc
= dnode_read(spa
, dnode
, 0, zap_scratch
, size
);
1912 zap_type
= *(uint64_t *) zap_scratch
;
1913 if (zap_type
== ZBT_MICRO
)
1914 return mzap_rlookup(spa
, dnode
, name
, value
);
1916 return fzap_rlookup(spa
, dnode
, name
, value
);
1920 zfs_rlookup(const spa_t
*spa
, uint64_t objnum
, char *result
)
1923 char component
[256];
1924 uint64_t dir_obj
, parent_obj
, child_dir_zapobj
;
1925 dnode_phys_t child_dir_zap
, dataset
, dir
, parent
;
1927 dsl_dataset_phys_t
*ds
;
1931 p
= &name
[sizeof(name
) - 1];
1934 if (objset_get_dnode(spa
, &spa
->spa_mos
, objnum
, &dataset
)) {
1935 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum
);
1938 ds
= (dsl_dataset_phys_t
*)&dataset
.dn_bonus
;
1939 dir_obj
= ds
->ds_dir_obj
;
1942 if (objset_get_dnode(spa
, &spa
->spa_mos
, dir_obj
, &dir
) != 0)
1944 dd
= (dsl_dir_phys_t
*)&dir
.dn_bonus
;
1946 /* Actual loop condition. */
1947 parent_obj
= dd
->dd_parent_obj
;
1948 if (parent_obj
== 0)
1951 if (objset_get_dnode(spa
, &spa
->spa_mos
, parent_obj
, &parent
) != 0)
1953 dd
= (dsl_dir_phys_t
*)&parent
.dn_bonus
;
1954 child_dir_zapobj
= dd
->dd_child_dir_zapobj
;
1955 if (objset_get_dnode(spa
, &spa
->spa_mos
, child_dir_zapobj
, &child_dir_zap
) != 0)
1957 if (zap_rlookup(spa
, &child_dir_zap
, component
, dir_obj
) != 0)
1960 len
= strlen(component
);
1962 memcpy(p
, component
, len
);
1966 /* Actual loop iteration. */
1967 dir_obj
= parent_obj
;
1978 zfs_lookup_dataset(const spa_t
*spa
, const char *name
, uint64_t *objnum
)
1981 uint64_t dir_obj
, child_dir_zapobj
;
1982 dnode_phys_t child_dir_zap
, dir
;
1986 if (objset_get_dnode(spa
, &spa
->spa_mos
, DMU_POOL_DIRECTORY_OBJECT
, &dir
))
1988 if (zap_lookup(spa
, &dir
, DMU_POOL_ROOT_DATASET
, sizeof (dir_obj
),
1994 if (objset_get_dnode(spa
, &spa
->spa_mos
, dir_obj
, &dir
))
1996 dd
= (dsl_dir_phys_t
*)&dir
.dn_bonus
;
2000 /* Actual loop condition #1. */
2006 memcpy(element
, p
, q
- p
);
2007 element
[q
- p
] = '\0';
2014 child_dir_zapobj
= dd
->dd_child_dir_zapobj
;
2015 if (objset_get_dnode(spa
, &spa
->spa_mos
, child_dir_zapobj
, &child_dir_zap
) != 0)
2018 /* Actual loop condition #2. */
2019 if (zap_lookup(spa
, &child_dir_zap
, element
, sizeof (dir_obj
),
2024 *objnum
= dd
->dd_head_dataset_obj
;
2028 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
2030 zfs_list_dataset(const spa_t
*spa
, uint64_t objnum
/*, int pos, char *entry*/)
2032 uint64_t dir_obj
, child_dir_zapobj
;
2033 dnode_phys_t child_dir_zap
, dir
, dataset
;
2034 dsl_dataset_phys_t
*ds
;
2037 if (objset_get_dnode(spa
, &spa
->spa_mos
, objnum
, &dataset
)) {
2038 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum
);
2041 ds
= (dsl_dataset_phys_t
*) &dataset
.dn_bonus
;
2042 dir_obj
= ds
->ds_dir_obj
;
2044 if (objset_get_dnode(spa
, &spa
->spa_mos
, dir_obj
, &dir
)) {
2045 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj
);
2048 dd
= (dsl_dir_phys_t
*)&dir
.dn_bonus
;
2050 child_dir_zapobj
= dd
->dd_child_dir_zapobj
;
2051 if (objset_get_dnode(spa
, &spa
->spa_mos
, child_dir_zapobj
, &child_dir_zap
) != 0) {
2052 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj
);
2056 return (zap_list(spa
, &child_dir_zap
) != 0);
2060 zfs_callback_dataset(const spa_t
*spa
, uint64_t objnum
, int (*callback
)(const char *, uint64_t))
2062 uint64_t dir_obj
, child_dir_zapobj
, zap_type
;
2063 dnode_phys_t child_dir_zap
, dir
, dataset
;
2064 dsl_dataset_phys_t
*ds
;
2068 err
= objset_get_dnode(spa
, &spa
->spa_mos
, objnum
, &dataset
);
2070 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum
);
2073 ds
= (dsl_dataset_phys_t
*) &dataset
.dn_bonus
;
2074 dir_obj
= ds
->ds_dir_obj
;
2076 err
= objset_get_dnode(spa
, &spa
->spa_mos
, dir_obj
, &dir
);
2078 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj
);
2081 dd
= (dsl_dir_phys_t
*)&dir
.dn_bonus
;
2083 child_dir_zapobj
= dd
->dd_child_dir_zapobj
;
2084 err
= objset_get_dnode(spa
, &spa
->spa_mos
, child_dir_zapobj
, &child_dir_zap
);
2086 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj
);
2090 err
= dnode_read(spa
, &child_dir_zap
, 0, zap_scratch
, child_dir_zap
.dn_datablkszsec
* 512);
2094 zap_type
= *(uint64_t *) zap_scratch
;
2095 if (zap_type
== ZBT_MICRO
)
2096 return mzap_list(&child_dir_zap
, callback
);
2098 return fzap_list(spa
, &child_dir_zap
, callback
);
2102 * Find the object set given the object number of its dataset object
2103 * and return its details in *objset
2106 zfs_mount_dataset(const spa_t
*spa
, uint64_t objnum
, objset_phys_t
*objset
)
2108 dnode_phys_t dataset
;
2109 dsl_dataset_phys_t
*ds
;
2111 if (objset_get_dnode(spa
, &spa
->spa_mos
, objnum
, &dataset
)) {
2112 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum
);
2116 ds
= (dsl_dataset_phys_t
*) &dataset
.dn_bonus
;
2117 if (zio_read(spa
, &ds
->ds_bp
, objset
)) {
2118 printf("ZFS: can't read object set for dataset %ju\n",
2127 * Find the object set pointed to by the BOOTFS property or the root
2128 * dataset if there is none and return its details in *objset
2131 zfs_get_root(const spa_t
*spa
, uint64_t *objid
)
2133 dnode_phys_t dir
, propdir
;
2134 uint64_t props
, bootfs
, root
;
2139 * Start with the MOS directory object.
2141 if (objset_get_dnode(spa
, &spa
->spa_mos
, DMU_POOL_DIRECTORY_OBJECT
, &dir
)) {
2142 printf("ZFS: can't read MOS object directory\n");
2147 * Lookup the pool_props and see if we can find a bootfs.
2149 if (zap_lookup(spa
, &dir
, DMU_POOL_PROPS
, sizeof (props
), 1, &props
) == 0
2150 && objset_get_dnode(spa
, &spa
->spa_mos
, props
, &propdir
) == 0
2151 && zap_lookup(spa
, &propdir
, "bootfs", sizeof (bootfs
), 1, &bootfs
) == 0
2158 * Lookup the root dataset directory
2160 if (zap_lookup(spa
, &dir
, DMU_POOL_ROOT_DATASET
, sizeof (root
), 1, &root
)
2161 || objset_get_dnode(spa
, &spa
->spa_mos
, root
, &dir
)) {
2162 printf("ZFS: can't find root dsl_dir\n");
2167 * Use the information from the dataset directory's bonus buffer
2168 * to find the dataset object and from that the object set itself.
2170 dsl_dir_phys_t
*dd
= (dsl_dir_phys_t
*) &dir
.dn_bonus
;
2171 *objid
= dd
->dd_head_dataset_obj
;
2176 zfs_mount(const spa_t
*spa
, uint64_t rootobj
, struct zfsmount
*mnt
)
2182 * Find the root object set if not explicitly provided
2184 if (rootobj
== 0 && zfs_get_root(spa
, &rootobj
)) {
2185 printf("ZFS: can't find root filesystem\n");
2189 if (zfs_mount_dataset(spa
, rootobj
, &mnt
->objset
)) {
2190 printf("ZFS: can't open root filesystem\n");
2194 mnt
->rootobj
= rootobj
;
2200 * callback function for feature name checks.
2203 check_feature(const char *name
, uint64_t value
)
2209 if (name
[0] == '\0')
2212 for (i
= 0; features_for_read
[i
] != NULL
; i
++) {
2213 if (strcmp(name
, features_for_read
[i
]) == 0)
2216 printf("ZFS: unsupported feature: %s\n", name
);
2221 * Checks whether the MOS features that are active are supported.
2224 check_mos_features(const spa_t
*spa
)
2227 uint64_t objnum
, zap_type
;
2231 if ((rc
= objset_get_dnode(spa
, &spa
->spa_mos
, DMU_OT_OBJECT_DIRECTORY
,
2234 if ((rc
= zap_lookup(spa
, &dir
, DMU_POOL_FEATURES_FOR_READ
,
2235 sizeof (objnum
), 1, &objnum
)) != 0) {
2237 * It is older pool without features. As we have already
2238 * tested the label, just return without raising the error.
2245 if ((rc
= objset_get_dnode(spa
, &spa
->spa_mos
, objnum
, &dir
)) != 0)
2248 if (dir
.dn_type
!= DMU_OTN_ZAP_METADATA
)
2251 size
= dir
.dn_datablkszsec
* 512;
2252 if (dnode_read(spa
, &dir
, 0, zap_scratch
, size
))
2255 zap_type
= *(uint64_t *) zap_scratch
;
2256 if (zap_type
== ZBT_MICRO
)
2257 rc
= mzap_list(&dir
, check_feature
);
2259 rc
= fzap_list(spa
, &dir
, check_feature
);
2265 zfs_spa_init(spa_t
*spa
)
2270 if (zio_read(spa
, &spa
->spa_uberblock
.ub_rootbp
, &spa
->spa_mos
)) {
2271 printf("ZFS: can't read MOS of pool %s\n", spa
->spa_name
);
2274 if (spa
->spa_mos
.os_type
!= DMU_OST_META
) {
2275 printf("ZFS: corrupted MOS of pool %s\n", spa
->spa_name
);
2279 if (objset_get_dnode(spa
, &spa
->spa_mos
, DMU_POOL_DIRECTORY_OBJECT
,
2281 printf("ZFS: failed to read pool %s directory object\n",
2285 /* this is allowed to fail, older pools do not have salt */
2286 rc
= zap_lookup(spa
, &dir
, DMU_POOL_CHECKSUM_SALT
, 1,
2287 sizeof (spa
->spa_cksum_salt
.zcs_bytes
),
2288 spa
->spa_cksum_salt
.zcs_bytes
);
2290 rc
= check_mos_features(spa
);
2292 printf("ZFS: pool %s is not supported\n", spa
->spa_name
);
2299 zfs_dnode_stat(const spa_t
*spa
, dnode_phys_t
*dn
, struct stat
*sb
)
2302 if (dn
->dn_bonustype
!= DMU_OT_SA
) {
2303 znode_phys_t
*zp
= (znode_phys_t
*)dn
->dn_bonus
;
2305 sb
->st_mode
= zp
->zp_mode
;
2306 sb
->st_uid
= zp
->zp_uid
;
2307 sb
->st_gid
= zp
->zp_gid
;
2308 sb
->st_size
= zp
->zp_size
;
2310 sa_hdr_phys_t
*sahdrp
;
2315 if (dn
->dn_bonuslen
!= 0)
2316 sahdrp
= (sa_hdr_phys_t
*)DN_BONUS(dn
);
2318 if ((dn
->dn_flags
& DNODE_FLAG_SPILL_BLKPTR
) != 0) {
2319 blkptr_t
*bp
= DN_SPILL_BLKPTR(dn
);
2322 size
= BP_GET_LSIZE(bp
);
2323 buf
= zfs_alloc(size
);
2324 error
= zio_read(spa
, bp
, buf
);
2326 zfs_free(buf
, size
);
2334 hdrsize
= SA_HDR_SIZE(sahdrp
);
2335 sb
->st_mode
= *(uint64_t *)((char *)sahdrp
+ hdrsize
+
2337 sb
->st_uid
= *(uint64_t *)((char *)sahdrp
+ hdrsize
+
2339 sb
->st_gid
= *(uint64_t *)((char *)sahdrp
+ hdrsize
+
2341 sb
->st_size
= *(uint64_t *)((char *)sahdrp
+ hdrsize
+
2344 zfs_free(buf
, size
);
2351 zfs_dnode_readlink(const spa_t
*spa
, dnode_phys_t
*dn
, char *path
, size_t psize
)
2355 if (dn
->dn_bonustype
== DMU_OT_SA
) {
2356 sa_hdr_phys_t
*sahdrp
= NULL
;
2362 if (dn
->dn_bonuslen
!= 0)
2363 sahdrp
= (sa_hdr_phys_t
*)DN_BONUS(dn
);
2367 if ((dn
->dn_flags
& DNODE_FLAG_SPILL_BLKPTR
) == 0)
2369 bp
= DN_SPILL_BLKPTR(dn
);
2371 size
= BP_GET_LSIZE(bp
);
2372 buf
= zfs_alloc(size
);
2373 rc
= zio_read(spa
, bp
, buf
);
2375 zfs_free(buf
, size
);
2380 hdrsize
= SA_HDR_SIZE(sahdrp
);
2381 p
= (char *)((uintptr_t)sahdrp
+ hdrsize
+ SA_SYMLINK_OFFSET
);
2382 memcpy(path
, p
, psize
);
2384 zfs_free(buf
, size
);
2388 * Second test is purely to silence bogus compiler
2389 * warning about accessing past the end of dn_bonus.
2391 if (psize
+ sizeof(znode_phys_t
) <= dn
->dn_bonuslen
&&
2392 sizeof(znode_phys_t
) <= sizeof(dn
->dn_bonus
)) {
2393 memcpy(path
, &dn
->dn_bonus
[sizeof(znode_phys_t
)], psize
);
2395 rc
= dnode_read(spa
, dn
, 0, path
, psize
);
2402 STAILQ_ENTRY(obj_list
) entry
;
2406 * Lookup a file and return its dnode.
2409 zfs_lookup(const struct zfsmount
*mnt
, const char *upath
, dnode_phys_t
*dnode
)
2418 int symlinks_followed
= 0;
2420 struct obj_list
*entry
, *tentry
;
2421 STAILQ_HEAD(, obj_list
) on_cache
= STAILQ_HEAD_INITIALIZER(on_cache
);
2424 if (mnt
->objset
.os_type
!= DMU_OST_ZFS
) {
2425 printf("ZFS: unexpected object set type %ju\n",
2426 (uintmax_t)mnt
->objset
.os_type
);
2430 if ((entry
= malloc(sizeof(struct obj_list
))) == NULL
)
2434 * Get the root directory dnode.
2436 rc
= objset_get_dnode(spa
, &mnt
->objset
, MASTER_NODE_OBJ
, &dn
);
2442 rc
= zap_lookup(spa
, &dn
, ZFS_ROOT_OBJ
, sizeof(objnum
), 1, &objnum
);
2447 entry
->objnum
= objnum
;
2448 STAILQ_INSERT_HEAD(&on_cache
, entry
, entry
);
2450 rc
= objset_get_dnode(spa
, &mnt
->objset
, objnum
, &dn
);
2456 rc
= objset_get_dnode(spa
, &mnt
->objset
, objnum
, &dn
);
2465 while (*q
!= '\0' && *q
!= '/')
2469 if (p
+ 1 == q
&& p
[0] == '.') {
2474 if (p
+ 2 == q
&& p
[0] == '.' && p
[1] == '.') {
2476 if (STAILQ_FIRST(&on_cache
) ==
2477 STAILQ_LAST(&on_cache
, obj_list
, entry
)) {
2481 entry
= STAILQ_FIRST(&on_cache
);
2482 STAILQ_REMOVE_HEAD(&on_cache
, entry
);
2484 objnum
= (STAILQ_FIRST(&on_cache
))->objnum
;
2487 if (q
- p
+ 1 > sizeof(element
)) {
2491 memcpy(element
, p
, q
- p
);
2495 if ((rc
= zfs_dnode_stat(spa
, &dn
, &sb
)) != 0)
2497 if (!S_ISDIR(sb
.st_mode
)) {
2502 rc
= zap_lookup(spa
, &dn
, element
, sizeof(objnum
), 1, &objnum
);
2505 objnum
= ZFS_DIRENT_OBJ(objnum
);
2507 if ((entry
= malloc(sizeof(struct obj_list
))) == NULL
) {
2511 entry
->objnum
= objnum
;
2512 STAILQ_INSERT_HEAD(&on_cache
, entry
, entry
);
2513 rc
= objset_get_dnode(spa
, &mnt
->objset
, objnum
, &dn
);
2518 * Check for symlink.
2520 rc
= zfs_dnode_stat(spa
, &dn
, &sb
);
2523 if (S_ISLNK(sb
.st_mode
)) {
2524 if (symlinks_followed
> 10) {
2528 symlinks_followed
++;
2531 * Read the link value and copy the tail of our
2532 * current path onto the end.
2534 if (sb
.st_size
+ strlen(p
) + 1 > sizeof(path
)) {
2538 strcpy(&path
[sb
.st_size
], p
);
2540 rc
= zfs_dnode_readlink(spa
, &dn
, path
, sb
.st_size
);
2545 * Restart with the new path, starting either at
2546 * the root or at the parent depending whether or
2547 * not the link is relative.
2551 while (STAILQ_FIRST(&on_cache
) !=
2552 STAILQ_LAST(&on_cache
, obj_list
, entry
)) {
2553 entry
= STAILQ_FIRST(&on_cache
);
2554 STAILQ_REMOVE_HEAD(&on_cache
, entry
);
2558 entry
= STAILQ_FIRST(&on_cache
);
2559 STAILQ_REMOVE_HEAD(&on_cache
, entry
);
2562 objnum
= (STAILQ_FIRST(&on_cache
))->objnum
;
2568 STAILQ_FOREACH_SAFE(entry
, &on_cache
, entry
, tentry
)