4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/cdefs.h>
/* CRC-64 table for the ZAP hash; filled in lazily by zfs_init_crc(). */
static uint64_t zfs_crc64_table[256];

/* Boot code runs without the kernel debug machinery: stub out assertions. */
#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)

/*
 * NOTE(review): the tail of this macro was lost in the extraction; the
 * conventional boot-loader form prints the message and spins -- confirm
 * against the pristine file.
 */
#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)

/* Map kernel allocator calls onto the loader's simple allocator. */
#define	kmem_alloc(size, flag)	zfs_alloc((size))
#define	kmem_free(ptr, size)	zfs_free((ptr), (size))
53 * Calculate the crc64 table (used for the zap hash
56 if (zfs_crc64_table
[128] != ZFS_CRC64_POLY
) {
57 memset(zfs_crc64_table
, 0, sizeof(zfs_crc64_table
));
58 for (i
= 0; i
< 256; i
++)
59 for (ct
= zfs_crc64_table
+ i
, *ct
= i
, j
= 8; j
> 0; j
--)
60 *ct
= (*ct
>> 1) ^ (-(*ct
& 1) & ZFS_CRC64_POLY
);
65 zio_checksum_off(const void *buf
, uint64_t size
,
66 const void *ctx_template
, zio_cksum_t
*zcp
)
68 ZIO_SET_CHECKSUM(zcp
, 0, 0, 0, 0);
72 * Signature for checksum functions.
74 typedef void zio_checksum_t(const void *data
, uint64_t size
,
75 const void *ctx_template
, zio_cksum_t
*zcp
);
76 typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t
*salt
);
77 typedef void zio_checksum_tmpl_free_t(void *ctx_template
);
79 typedef enum zio_checksum_flags
{
80 /* Strong enough for metadata? */
81 ZCHECKSUM_FLAG_METADATA
= (1 << 1),
82 /* ZIO embedded checksum */
83 ZCHECKSUM_FLAG_EMBEDDED
= (1 << 2),
84 /* Strong enough for dedup (without verification)? */
85 ZCHECKSUM_FLAG_DEDUP
= (1 << 3),
87 ZCHECKSUM_FLAG_SALTED
= (1 << 4),
88 /* Strong enough for nopwrite? */
89 ZCHECKSUM_FLAG_NOPWRITE
= (1 << 5)
90 } zio_checksum_flags_t
;
93 * Information about each checksum function.
95 typedef struct zio_checksum_info
{
96 /* checksum function for each byteorder */
97 zio_checksum_t
*ci_func
[2];
98 zio_checksum_tmpl_init_t
*ci_tmpl_init
;
99 zio_checksum_tmpl_free_t
*ci_tmpl_free
;
100 zio_checksum_flags_t ci_flags
;
101 const char *ci_name
; /* descriptive name */
102 } zio_checksum_info_t
;
106 #include "fletcher.c"
108 #include "skein_zfs.c"
109 #include "edonr_zfs.c"
111 static zio_checksum_info_t zio_checksum_table
[ZIO_CHECKSUM_FUNCTIONS
] = {
112 {{NULL
, NULL
}, NULL
, NULL
, 0, "inherit"},
113 {{NULL
, NULL
}, NULL
, NULL
, 0, "on"},
114 {{zio_checksum_off
, zio_checksum_off
}, NULL
, NULL
, 0, "off"},
115 {{zio_checksum_SHA256
, zio_checksum_SHA256
}, NULL
, NULL
,
116 ZCHECKSUM_FLAG_METADATA
| ZCHECKSUM_FLAG_EMBEDDED
, "label"},
117 {{zio_checksum_SHA256
, zio_checksum_SHA256
}, NULL
, NULL
,
118 ZCHECKSUM_FLAG_METADATA
| ZCHECKSUM_FLAG_EMBEDDED
, "gang_header"},
119 {{fletcher_2_native
, fletcher_2_byteswap
}, NULL
, NULL
,
120 ZCHECKSUM_FLAG_EMBEDDED
, "zilog"},
121 {{fletcher_2_native
, fletcher_2_byteswap
}, NULL
, NULL
,
123 {{fletcher_4_native
, fletcher_4_byteswap
}, NULL
, NULL
,
124 ZCHECKSUM_FLAG_METADATA
, "fletcher4"},
125 {{zio_checksum_SHA256
, zio_checksum_SHA256
}, NULL
, NULL
,
126 ZCHECKSUM_FLAG_METADATA
| ZCHECKSUM_FLAG_DEDUP
|
127 ZCHECKSUM_FLAG_NOPWRITE
, "SHA256"},
128 {{fletcher_4_native
, fletcher_4_byteswap
}, NULL
, NULL
,
129 ZCHECKSUM_FLAG_EMBEDDED
, "zillog2"},
130 {{zio_checksum_off
, zio_checksum_off
}, NULL
, NULL
,
132 {{zio_checksum_SHA512_native
, zio_checksum_SHA512_byteswap
},
133 NULL
, NULL
, ZCHECKSUM_FLAG_METADATA
| ZCHECKSUM_FLAG_DEDUP
|
134 ZCHECKSUM_FLAG_NOPWRITE
, "SHA512"},
135 /* no skein and edonr for now */
136 {{zio_checksum_skein_native
, zio_checksum_skein_byteswap
},
137 zio_checksum_skein_tmpl_init
, zio_checksum_skein_tmpl_free
,
138 ZCHECKSUM_FLAG_METADATA
| ZCHECKSUM_FLAG_DEDUP
|
139 ZCHECKSUM_FLAG_SALTED
| ZCHECKSUM_FLAG_NOPWRITE
, "skein"},
140 {{zio_checksum_edonr_native
, zio_checksum_edonr_byteswap
},
141 zio_checksum_edonr_tmpl_init
, zio_checksum_edonr_tmpl_free
,
142 ZCHECKSUM_FLAG_METADATA
| ZCHECKSUM_FLAG_SALTED
|
143 ZCHECKSUM_FLAG_NOPWRITE
, "edonr"},
/*
 * Common signature for all zio compress/decompress functions.
 * The final int is the implementation-specific level parameter.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

extern int gzip_decompress(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t *ci_compress;	/* compression function */
	zio_decompress_func_t *ci_decompress;	/* decompression function */
	int ci_level;				/* level parameter */
	const char *ci_name;			/* algorithm name */
} zio_compress_info_t;
171 * Compression vectors.
173 static zio_compress_info_t zio_compress_table
[ZIO_COMPRESS_FUNCTIONS
] = {
174 {NULL
, NULL
, 0, "inherit"},
175 {NULL
, NULL
, 0, "on"},
176 {NULL
, NULL
, 0, "uncompressed"},
177 {NULL
, lzjb_decompress
, 0, "lzjb"},
178 {NULL
, NULL
, 0, "empty"},
179 {NULL
, gzip_decompress
, 1, "gzip-1"},
180 {NULL
, gzip_decompress
, 2, "gzip-2"},
181 {NULL
, gzip_decompress
, 3, "gzip-3"},
182 {NULL
, gzip_decompress
, 4, "gzip-4"},
183 {NULL
, gzip_decompress
, 5, "gzip-5"},
184 {NULL
, gzip_decompress
, 6, "gzip-6"},
185 {NULL
, gzip_decompress
, 7, "gzip-7"},
186 {NULL
, gzip_decompress
, 8, "gzip-8"},
187 {NULL
, gzip_decompress
, 9, "gzip-9"},
188 {NULL
, zle_decompress
, 64, "zle"},
189 {NULL
, lz4_decompress
, 0, "lz4"},
/*
 * Byte-swap an array of uint64_t in place.  size is in bytes and must be
 * a multiple of 8.
 */
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	size_t i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
206 * Set the external verifier for a gang block based on <vdev, offset, txg>,
207 * a tuple which is guaranteed to be unique for the life of the pool.
210 zio_checksum_gang_verifier(zio_cksum_t
*zcp
, const blkptr_t
*bp
)
212 const dva_t
*dva
= BP_IDENTITY(bp
);
213 uint64_t txg
= BP_PHYSICAL_BIRTH(bp
);
215 ASSERT(BP_IS_GANG(bp
));
217 ZIO_SET_CHECKSUM(zcp
, DVA_GET_VDEV(dva
), DVA_GET_OFFSET(dva
), txg
, 0);
221 * Set the external verifier for a label block based on its offset.
222 * The vdev is implicit, and the txg is unknowable at pool open time --
223 * hence the logic in vdev_uberblock_load() to find the most recent copy.
226 zio_checksum_label_verifier(zio_cksum_t
*zcp
, uint64_t offset
)
228 ZIO_SET_CHECKSUM(zcp
, offset
, 0, 0, 0);
232 * Calls the template init function of a checksum which supports context
233 * templates and installs the template into the spa_t.
236 zio_checksum_template_init(enum zio_checksum checksum
, spa_t
*spa
)
238 zio_checksum_info_t
*ci
= &zio_checksum_table
[checksum
];
240 if (ci
->ci_tmpl_init
== NULL
)
243 if (spa
->spa_cksum_tmpls
[checksum
] != NULL
)
246 if (spa
->spa_cksum_tmpls
[checksum
] == NULL
) {
247 spa
->spa_cksum_tmpls
[checksum
] =
248 ci
->ci_tmpl_init(&spa
->spa_cksum_salt
);
253 * Called by a spa_t that's about to be deallocated. This steps through
254 * all of the checksum context templates and deallocates any that were
255 * initialized using the algorithm-specific template init function.
258 zio_checksum_templates_free(spa_t
*spa
)
260 for (enum zio_checksum checksum
= 0;
261 checksum
< ZIO_CHECKSUM_FUNCTIONS
; checksum
++) {
262 if (spa
->spa_cksum_tmpls
[checksum
] != NULL
) {
263 zio_checksum_info_t
*ci
= &zio_checksum_table
[checksum
];
265 ci
->ci_tmpl_free(spa
->spa_cksum_tmpls
[checksum
]);
266 spa
->spa_cksum_tmpls
[checksum
] = NULL
;
272 zio_checksum_verify(const spa_t
*spa
, const blkptr_t
*bp
, void *data
)
275 unsigned int checksum
;
276 zio_checksum_info_t
*ci
;
278 zio_cksum_t actual_cksum
, expected_cksum
, verifier
;
281 checksum
= BP_GET_CHECKSUM(bp
);
282 size
= BP_GET_PSIZE(bp
);
284 if (checksum
>= ZIO_CHECKSUM_FUNCTIONS
)
286 ci
= &zio_checksum_table
[checksum
];
287 if (ci
->ci_func
[0] == NULL
|| ci
->ci_func
[1] == NULL
)
291 zio_checksum_template_init(checksum
, (spa_t
*) spa
);
292 ctx
= spa
->spa_cksum_tmpls
[checksum
];
295 if (ci
->ci_flags
& ZCHECKSUM_FLAG_EMBEDDED
) {
298 ASSERT(checksum
== ZIO_CHECKSUM_GANG_HEADER
||
299 checksum
== ZIO_CHECKSUM_LABEL
);
301 eck
= (zio_eck_t
*)((char *)data
+ size
) - 1;
303 if (checksum
== ZIO_CHECKSUM_GANG_HEADER
)
304 zio_checksum_gang_verifier(&verifier
, bp
);
305 else if (checksum
== ZIO_CHECKSUM_LABEL
)
306 zio_checksum_label_verifier(&verifier
,
307 DVA_GET_OFFSET(BP_IDENTITY(bp
)));
309 verifier
= bp
->blk_cksum
;
311 byteswap
= (eck
->zec_magic
== BSWAP_64(ZEC_MAGIC
));
314 byteswap_uint64_array(&verifier
, sizeof (zio_cksum_t
));
316 expected_cksum
= eck
->zec_cksum
;
317 eck
->zec_cksum
= verifier
;
318 ci
->ci_func
[byteswap
](data
, size
, ctx
, &actual_cksum
);
319 eck
->zec_cksum
= expected_cksum
;
322 byteswap_uint64_array(&expected_cksum
,
323 sizeof (zio_cksum_t
));
325 expected_cksum
= bp
->blk_cksum
;
326 ci
->ci_func
[0](data
, size
, ctx
, &actual_cksum
);
329 if (!ZIO_CHECKSUM_EQUAL(actual_cksum
, expected_cksum
)) {
330 /* printf("ZFS: read checksum %s failed\n", ci->ci_name); */
338 zio_decompress_data(int cpfunc
, void *src
, uint64_t srcsize
,
339 void *dest
, uint64_t destsize
)
341 zio_compress_info_t
*ci
;
343 if (cpfunc
>= ZIO_COMPRESS_FUNCTIONS
) {
344 printf("ZFS: unsupported compression algorithm %u\n", cpfunc
);
348 ci
= &zio_compress_table
[cpfunc
];
349 if (!ci
->ci_decompress
) {
350 printf("ZFS: unsupported compression algorithm %s\n",
355 return (ci
->ci_decompress(src
, dest
, srcsize
, destsize
, ci
->ci_level
));
359 zap_hash(uint64_t salt
, const char *name
)
366 ASSERT(zfs_crc64_table
[128] == ZFS_CRC64_POLY
);
367 for (cp
= (const uint8_t *)name
; (c
= *cp
) != '\0'; cp
++)
368 crc
= (crc
>> 8) ^ zfs_crc64_table
[(crc
^ c
) & 0xFF];
371 * Only use 28 bits, since we need 4 bits in the cookie for the
372 * collision differentiator. We MUST use the high bits, since
373 * those are the onces that we first pay attention to when
374 * chosing the bucket.
376 crc
&= ~((1ULL << (64 - ZAP_HASHBITS
)) - 1);
/* Loader allocator, defined elsewhere in the boot code. */
static void *zfs_alloc(size_t size);
static void zfs_free(void *ptr, size_t size);

/* One column (child vdev) of a RAID-Z I/O. */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

/* Geometry and column state for one logical RAID-Z I/O. */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2
/* Multiply one byte by 2 (resp. 4) in GF(2^8) with polynomial 0x11d. */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 * pow2[i] == 2^i; log2[pow2[i]] == i for i in [1, 254]; log2[0] is unused.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
511 * Multiply a given number by 2 raised to the given power.
514 vdev_raidz_exp2(uint8_t a
, int exp
)
520 ASSERT(vdev_raidz_log2
[a
] > 0 || a
== 1);
522 exp
+= vdev_raidz_log2
[a
];
526 return (vdev_raidz_pow2
[exp
]);
530 vdev_raidz_generate_parity_p(raidz_map_t
*rm
)
532 uint64_t *p
, *src
, pcount
__attribute__((unused
)), ccount
, i
;
535 pcount
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
/ sizeof (src
[0]);
537 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
538 src
= rm
->rm_col
[c
].rc_data
;
539 p
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
;
540 ccount
= rm
->rm_col
[c
].rc_size
/ sizeof (src
[0]);
542 if (c
== rm
->rm_firstdatacol
) {
543 ASSERT(ccount
== pcount
);
544 for (i
= 0; i
< ccount
; i
++, src
++, p
++) {
548 ASSERT(ccount
<= pcount
);
549 for (i
= 0; i
< ccount
; i
++, src
++, p
++) {
557 vdev_raidz_generate_parity_pq(raidz_map_t
*rm
)
559 uint64_t *p
, *q
, *src
, pcnt
, ccnt
, mask
, i
;
562 pcnt
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
/ sizeof (src
[0]);
563 ASSERT(rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
==
564 rm
->rm_col
[VDEV_RAIDZ_Q
].rc_size
);
566 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
567 src
= rm
->rm_col
[c
].rc_data
;
568 p
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
;
569 q
= rm
->rm_col
[VDEV_RAIDZ_Q
].rc_data
;
571 ccnt
= rm
->rm_col
[c
].rc_size
/ sizeof (src
[0]);
573 if (c
== rm
->rm_firstdatacol
) {
574 ASSERT(ccnt
== pcnt
|| ccnt
== 0);
575 for (i
= 0; i
< ccnt
; i
++, src
++, p
++, q
++) {
579 for (; i
< pcnt
; i
++, src
++, p
++, q
++) {
584 ASSERT(ccnt
<= pcnt
);
587 * Apply the algorithm described above by multiplying
588 * the previous result and adding in the new value.
590 for (i
= 0; i
< ccnt
; i
++, src
++, p
++, q
++) {
593 VDEV_RAIDZ_64MUL_2(*q
, mask
);
598 * Treat short columns as though they are full of 0s.
599 * Note that there's therefore nothing needed for P.
601 for (; i
< pcnt
; i
++, q
++) {
602 VDEV_RAIDZ_64MUL_2(*q
, mask
);
609 vdev_raidz_generate_parity_pqr(raidz_map_t
*rm
)
611 uint64_t *p
, *q
, *r
, *src
, pcnt
, ccnt
, mask
, i
;
614 pcnt
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
/ sizeof (src
[0]);
615 ASSERT(rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
==
616 rm
->rm_col
[VDEV_RAIDZ_Q
].rc_size
);
617 ASSERT(rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
==
618 rm
->rm_col
[VDEV_RAIDZ_R
].rc_size
);
620 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
621 src
= rm
->rm_col
[c
].rc_data
;
622 p
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
;
623 q
= rm
->rm_col
[VDEV_RAIDZ_Q
].rc_data
;
624 r
= rm
->rm_col
[VDEV_RAIDZ_R
].rc_data
;
626 ccnt
= rm
->rm_col
[c
].rc_size
/ sizeof (src
[0]);
628 if (c
== rm
->rm_firstdatacol
) {
629 ASSERT(ccnt
== pcnt
|| ccnt
== 0);
630 for (i
= 0; i
< ccnt
; i
++, src
++, p
++, q
++, r
++) {
635 for (; i
< pcnt
; i
++, src
++, p
++, q
++, r
++) {
641 ASSERT(ccnt
<= pcnt
);
644 * Apply the algorithm described above by multiplying
645 * the previous result and adding in the new value.
647 for (i
= 0; i
< ccnt
; i
++, src
++, p
++, q
++, r
++) {
650 VDEV_RAIDZ_64MUL_2(*q
, mask
);
653 VDEV_RAIDZ_64MUL_4(*r
, mask
);
658 * Treat short columns as though they are full of 0s.
659 * Note that there's therefore nothing needed for P.
661 for (; i
< pcnt
; i
++, q
++, r
++) {
662 VDEV_RAIDZ_64MUL_2(*q
, mask
);
663 VDEV_RAIDZ_64MUL_4(*r
, mask
);
670 * Generate RAID parity in the first virtual columns according to the number of
671 * parity columns available.
674 vdev_raidz_generate_parity(raidz_map_t
*rm
)
676 switch (rm
->rm_firstdatacol
) {
678 vdev_raidz_generate_parity_p(rm
);
681 vdev_raidz_generate_parity_pq(rm
);
684 vdev_raidz_generate_parity_pqr(rm
);
687 panic("invalid RAID-Z configuration");
693 * In the general case of reconstruction, we must solve the system of linear
694 * equations defined by the coefficients used to generate parity as well as
695 * the contents of the data and parity disks. This can be expressed with
696 * vectors for the original data (D) and the actual data (d) and parity (p)
697 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
701 * | V | | D_0 | | p_m-1 |
702 * | | x | : | = | d_0 |
703 * | I | | D_n-1 | | : |
704 * | | ~~ ~~ | d_n-1 |
707 * I is simply a square identity matrix of size n, and V is a vandermonde
708 * matrix defined by the coefficients we chose for the various parity columns
709 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
710 * computation as well as linear separability.
713 * | 1 .. 1 1 1 | | p_0 |
714 * | 2^n-1 .. 4 2 1 | __ __ | : |
715 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
716 * | 1 .. 0 0 0 | | D_1 | | d_0 |
717 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
718 * | : : : : | | : | | d_2 |
719 * | 0 .. 1 0 0 | | D_n-1 | | : |
720 * | 0 .. 0 1 0 | ~~ ~~ | : |
721 * | 0 .. 0 0 1 | | d_n-1 |
724 * Note that I, V, d, and p are known. To compute D, we must invert the
725 * matrix and use the known data and parity values to reconstruct the unknown
726 * data values. We begin by removing the rows in V|I and d|p that correspond
727 * to failed or missing columns; we then make V|I square (n x n) and d|p
728 * sized n by removing rows corresponding to unused parity from the bottom up
729 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
730 * using Gauss-Jordan elimination. In the example below we use m=3 parity
731 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
733 * | 1 1 1 1 1 1 1 1 |
734 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
735 * | 19 205 116 29 64 16 4 1 | / /
736 * | 1 0 0 0 0 0 0 0 | / /
737 * | 0 1 0 0 0 0 0 0 | <--' /
738 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
739 * | 0 0 0 1 0 0 0 0 |
740 * | 0 0 0 0 1 0 0 0 |
741 * | 0 0 0 0 0 1 0 0 |
742 * | 0 0 0 0 0 0 1 0 |
743 * | 0 0 0 0 0 0 0 1 |
746 * | 1 1 1 1 1 1 1 1 |
747 * | 128 64 32 16 8 4 2 1 |
748 * | 19 205 116 29 64 16 4 1 |
749 * | 1 0 0 0 0 0 0 0 |
750 * | 0 1 0 0 0 0 0 0 |
751 * (V|I)' = | 0 0 1 0 0 0 0 0 |
752 * | 0 0 0 1 0 0 0 0 |
753 * | 0 0 0 0 1 0 0 0 |
754 * | 0 0 0 0 0 1 0 0 |
755 * | 0 0 0 0 0 0 1 0 |
756 * | 0 0 0 0 0 0 0 1 |
759 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
760 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
761 * matrix is not singular.
763 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
764 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
765 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
766 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
767 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
768 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
769 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
770 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
773 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
774 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
775 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
776 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
777 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
778 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
779 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
780 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
783 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
784 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
785 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
786 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
787 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
788 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
789 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
790 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
793 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
794 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
795 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
796 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
797 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
798 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
799 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
800 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
803 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
804 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
805 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
806 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
807 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
808 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
809 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
810 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
813 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
814 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
815 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
816 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
817 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
818 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
819 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
820 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
823 * | 0 0 1 0 0 0 0 0 |
824 * | 167 100 5 41 159 169 217 208 |
825 * | 166 100 4 40 158 168 216 209 |
826 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
827 * | 0 0 0 0 1 0 0 0 |
828 * | 0 0 0 0 0 1 0 0 |
829 * | 0 0 0 0 0 0 1 0 |
830 * | 0 0 0 0 0 0 0 1 |
833 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
834 * of the missing data.
836 * As is apparent from the example above, the only non-trivial rows in the
837 * inverse matrix correspond to the data disks that we're trying to
838 * reconstruct. Indeed, those are the only rows we need as the others would
839 * only be useful for reconstructing data known or assumed to be valid. For
840 * that reason, we only build the coefficients in the rows that correspond to
846 vdev_raidz_matrix_init(raidz_map_t
*rm
, int n
, int nmap
, int *map
,
852 ASSERT(n
== rm
->rm_cols
- rm
->rm_firstdatacol
);
855 * Fill in the missing rows of interest.
857 for (i
= 0; i
< nmap
; i
++) {
858 ASSERT3S(0, <=, map
[i
]);
859 ASSERT3S(map
[i
], <=, 2);
866 for (j
= 0; j
< n
; j
++) {
870 rows
[i
][j
] = vdev_raidz_pow2
[pow
];
876 vdev_raidz_matrix_invert(raidz_map_t
*rm
, int n
, int nmissing
, int *missing
,
877 uint8_t **rows
, uint8_t **invrows
, const uint8_t *used
)
883 * Assert that the first nmissing entries from the array of used
884 * columns correspond to parity columns and that subsequent entries
885 * correspond to data columns.
887 for (i
= 0; i
< nmissing
; i
++) {
888 ASSERT3S(used
[i
], <, rm
->rm_firstdatacol
);
891 ASSERT3S(used
[i
], >=, rm
->rm_firstdatacol
);
895 * First initialize the storage where we'll compute the inverse rows.
897 for (i
= 0; i
< nmissing
; i
++) {
898 for (j
= 0; j
< n
; j
++) {
899 invrows
[i
][j
] = (i
== j
) ? 1 : 0;
904 * Subtract all trivial rows from the rows of consequence.
906 for (i
= 0; i
< nmissing
; i
++) {
907 for (j
= nmissing
; j
< n
; j
++) {
908 ASSERT3U(used
[j
], >=, rm
->rm_firstdatacol
);
909 jj
= used
[j
] - rm
->rm_firstdatacol
;
911 invrows
[i
][j
] = rows
[i
][jj
];
917 * For each of the rows of interest, we must normalize it and subtract
918 * a multiple of it from the other rows.
920 for (i
= 0; i
< nmissing
; i
++) {
921 for (j
= 0; j
< missing
[i
]; j
++) {
922 ASSERT3U(rows
[i
][j
], ==, 0);
924 ASSERT3U(rows
[i
][missing
[i
]], !=, 0);
927 * Compute the inverse of the first element and multiply each
928 * element in the row by that value.
930 log
= 255 - vdev_raidz_log2
[rows
[i
][missing
[i
]]];
932 for (j
= 0; j
< n
; j
++) {
933 rows
[i
][j
] = vdev_raidz_exp2(rows
[i
][j
], log
);
934 invrows
[i
][j
] = vdev_raidz_exp2(invrows
[i
][j
], log
);
937 for (ii
= 0; ii
< nmissing
; ii
++) {
941 ASSERT3U(rows
[ii
][missing
[i
]], !=, 0);
943 log
= vdev_raidz_log2
[rows
[ii
][missing
[i
]]];
945 for (j
= 0; j
< n
; j
++) {
947 vdev_raidz_exp2(rows
[i
][j
], log
);
949 vdev_raidz_exp2(invrows
[i
][j
], log
);
955 * Verify that the data that is left in the rows are properly part of
956 * an identity matrix.
958 for (i
= 0; i
< nmissing
; i
++) {
959 for (j
= 0; j
< n
; j
++) {
960 if (j
== missing
[i
]) {
961 ASSERT3U(rows
[i
][j
], ==, 1);
963 ASSERT3U(rows
[i
][j
], ==, 0);
970 vdev_raidz_matrix_reconstruct(raidz_map_t
*rm
, int n
, int nmissing
,
971 int *missing
, uint8_t **invrows
, const uint8_t *used
)
976 uint8_t *dst
[VDEV_RAIDZ_MAXPARITY
];
977 uint64_t dcount
[VDEV_RAIDZ_MAXPARITY
];
980 uint8_t *invlog
[VDEV_RAIDZ_MAXPARITY
];
985 psize
= sizeof (invlog
[0][0]) * n
* nmissing
;
986 p
= zfs_alloc(psize
);
988 for (pp
= p
, i
= 0; i
< nmissing
; i
++) {
993 for (i
= 0; i
< nmissing
; i
++) {
994 for (j
= 0; j
< n
; j
++) {
995 ASSERT3U(invrows
[i
][j
], !=, 0);
996 invlog
[i
][j
] = vdev_raidz_log2
[invrows
[i
][j
]];
1000 for (i
= 0; i
< n
; i
++) {
1002 ASSERT3U(c
, <, rm
->rm_cols
);
1004 src
= rm
->rm_col
[c
].rc_data
;
1005 ccount
= rm
->rm_col
[c
].rc_size
;
1006 for (j
= 0; j
< nmissing
; j
++) {
1007 cc
= missing
[j
] + rm
->rm_firstdatacol
;
1008 ASSERT3U(cc
, >=, rm
->rm_firstdatacol
);
1009 ASSERT3U(cc
, <, rm
->rm_cols
);
1010 ASSERT3U(cc
, !=, c
);
1012 dst
[j
] = rm
->rm_col
[cc
].rc_data
;
1013 dcount
[j
] = rm
->rm_col
[cc
].rc_size
;
1016 ASSERT(ccount
>= rm
->rm_col
[missing
[0]].rc_size
|| i
> 0);
1018 for (x
= 0; x
< ccount
; x
++, src
++) {
1020 log
= vdev_raidz_log2
[*src
];
1022 for (cc
= 0; cc
< nmissing
; cc
++) {
1023 if (x
>= dcount
[cc
])
1029 if ((ll
= log
+ invlog
[cc
][i
]) >= 255)
1031 val
= vdev_raidz_pow2
[ll
];
1046 vdev_raidz_reconstruct_general(raidz_map_t
*rm
, int *tgts
, int ntgts
)
1050 int missing_rows
[VDEV_RAIDZ_MAXPARITY
];
1051 int parity_map
[VDEV_RAIDZ_MAXPARITY
];
1056 uint8_t *rows
[VDEV_RAIDZ_MAXPARITY
];
1057 uint8_t *invrows
[VDEV_RAIDZ_MAXPARITY
];
1063 n
= rm
->rm_cols
- rm
->rm_firstdatacol
;
1066 * Figure out which data columns are missing.
1069 for (t
= 0; t
< ntgts
; t
++) {
1070 if (tgts
[t
] >= rm
->rm_firstdatacol
) {
1071 missing_rows
[nmissing_rows
++] =
1072 tgts
[t
] - rm
->rm_firstdatacol
;
1077 * Figure out which parity columns to use to help generate the missing
1080 for (tt
= 0, c
= 0, i
= 0; i
< nmissing_rows
; c
++) {
1082 ASSERT(c
< rm
->rm_firstdatacol
);
1085 * Skip any targeted parity columns.
1087 if (c
== tgts
[tt
]) {
1099 ASSERT3U(code
, <, 1 << VDEV_RAIDZ_MAXPARITY
);
1101 psize
= (sizeof (rows
[0][0]) + sizeof (invrows
[0][0])) *
1102 nmissing_rows
* n
+ sizeof (used
[0]) * n
;
1103 p
= kmem_alloc(psize
, KM_SLEEP
);
1105 for (pp
= p
, i
= 0; i
< nmissing_rows
; i
++) {
1113 for (i
= 0; i
< nmissing_rows
; i
++) {
1114 used
[i
] = parity_map
[i
];
1117 for (tt
= 0, c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
1118 if (tt
< nmissing_rows
&&
1119 c
== missing_rows
[tt
] + rm
->rm_firstdatacol
) {
1130 * Initialize the interesting rows of the matrix.
1132 vdev_raidz_matrix_init(rm
, n
, nmissing_rows
, parity_map
, rows
);
1135 * Invert the matrix.
1137 vdev_raidz_matrix_invert(rm
, n
, nmissing_rows
, missing_rows
, rows
,
1141 * Reconstruct the missing data using the generated matrix.
1143 vdev_raidz_matrix_reconstruct(rm
, n
, nmissing_rows
, missing_rows
,
1146 kmem_free(p
, psize
);
1152 vdev_raidz_reconstruct(raidz_map_t
*rm
, int *t
, int nt
)
1154 int tgts
[VDEV_RAIDZ_MAXPARITY
];
1158 int nbadparity
, nbaddata
;
1161 * The tgts list must already be sorted.
1163 for (i
= 1; i
< nt
; i
++) {
1164 ASSERT(t
[i
] > t
[i
- 1]);
1167 nbadparity
= rm
->rm_firstdatacol
;
1168 nbaddata
= rm
->rm_cols
- nbadparity
;
1170 for (i
= 0, c
= 0; c
< rm
->rm_cols
; c
++) {
1171 if (i
< nt
&& c
== t
[i
]) {
1174 } else if (rm
->rm_col
[c
].rc_error
!= 0) {
1176 } else if (c
>= rm
->rm_firstdatacol
) {
1183 ASSERT(ntgts
>= nt
);
1184 ASSERT(nbaddata
>= 0);
1185 ASSERT(nbaddata
+ nbadparity
== ntgts
);
1187 code
= vdev_raidz_reconstruct_general(rm
, tgts
, ntgts
);
1188 ASSERT(code
< (1 << VDEV_RAIDZ_MAXPARITY
));
1193 static raidz_map_t
*
1194 vdev_raidz_map_alloc(void *data
, off_t offset
, size_t size
, uint64_t unit_shift
,
1195 uint64_t dcols
, uint64_t nparity
)
1198 uint64_t b
= offset
>> unit_shift
;
1199 uint64_t s
= size
>> unit_shift
;
1200 uint64_t f
= b
% dcols
;
1201 uint64_t o
= (b
/ dcols
) << unit_shift
;
1202 uint64_t q
, r
, c
, bc
, col
, acols
, scols
, coff
, devidx
, asize
, tot
;
1204 q
= s
/ (dcols
- nparity
);
1205 r
= s
- q
* (dcols
- nparity
);
1206 bc
= (r
== 0 ? 0 : r
+ nparity
);
1207 tot
= s
+ nparity
* (q
+ (r
== 0 ? 0 : 1));
1211 scols
= MIN(dcols
, roundup(bc
, nparity
+ 1));
1217 ASSERT3U(acols
, <=, scols
);
1219 rm
= zfs_alloc(offsetof(raidz_map_t
, rm_col
[scols
]));
1221 rm
->rm_cols
= acols
;
1222 rm
->rm_scols
= scols
;
1223 rm
->rm_bigcols
= bc
;
1224 rm
->rm_skipstart
= bc
;
1225 rm
->rm_missingdata
= 0;
1226 rm
->rm_missingparity
= 0;
1227 rm
->rm_firstdatacol
= nparity
;
1230 rm
->rm_ecksuminjected
= 0;
1234 for (c
= 0; c
< scols
; c
++) {
1239 coff
+= 1ULL << unit_shift
;
1241 rm
->rm_col
[c
].rc_devidx
= col
;
1242 rm
->rm_col
[c
].rc_offset
= coff
;
1243 rm
->rm_col
[c
].rc_data
= NULL
;
1244 rm
->rm_col
[c
].rc_error
= 0;
1245 rm
->rm_col
[c
].rc_tried
= 0;
1246 rm
->rm_col
[c
].rc_skipped
= 0;
1249 rm
->rm_col
[c
].rc_size
= 0;
1251 rm
->rm_col
[c
].rc_size
= (q
+ 1) << unit_shift
;
1253 rm
->rm_col
[c
].rc_size
= q
<< unit_shift
;
1255 asize
+= rm
->rm_col
[c
].rc_size
;
1258 ASSERT3U(asize
, ==, tot
<< unit_shift
);
1259 rm
->rm_asize
= roundup(asize
, (nparity
+ 1) << unit_shift
);
1260 rm
->rm_nskip
= roundup(tot
, nparity
+ 1) - tot
;
1261 ASSERT3U(rm
->rm_asize
- asize
, ==, rm
->rm_nskip
<< unit_shift
);
1262 ASSERT3U(rm
->rm_nskip
, <=, nparity
);
1264 for (c
= 0; c
< rm
->rm_firstdatacol
; c
++)
1265 rm
->rm_col
[c
].rc_data
= zfs_alloc(rm
->rm_col
[c
].rc_size
);
1267 rm
->rm_col
[c
].rc_data
= data
;
1269 for (c
= c
+ 1; c
< acols
; c
++)
1270 rm
->rm_col
[c
].rc_data
= (char *)rm
->rm_col
[c
- 1].rc_data
+
1271 rm
->rm_col
[c
- 1].rc_size
;
1274 * If all data stored spans all columns, there's a danger that parity
1275 * will always be on the same device and, since parity isn't read
1276 * during normal operation, that that device's I/O bandwidth won't be
1277 * used effectively. We therefore switch the parity every 1MB.
1279 * ... at least that was, ostensibly, the theory. As a practical
1280 * matter unless we juggle the parity between all devices evenly, we
1281 * won't see any benefit. Further, occasional writes that aren't a
1282 * multiple of the LCM of the number of children and the minimum
1283 * stripe width are sufficient to avoid pessimal behavior.
1284 * Unfortunately, this decision created an implicit on-disk format
1285 * requirement that we need to support for all eternity, but only
1286 * for single-parity RAID-Z.
1288 * If we intend to skip a sector in the zeroth column for padding
1289 * we must make sure to note this swap. We will never intend to
1290 * skip the first column since at least one data and one parity
1291 * column must appear in each row.
1293 ASSERT(rm
->rm_cols
>= 2);
1294 ASSERT(rm
->rm_col
[0].rc_size
== rm
->rm_col
[1].rc_size
);
1296 if (rm
->rm_firstdatacol
== 1 && (offset
& (1ULL << 20))) {
1297 devidx
= rm
->rm_col
[0].rc_devidx
;
1298 o
= rm
->rm_col
[0].rc_offset
;
1299 rm
->rm_col
[0].rc_devidx
= rm
->rm_col
[1].rc_devidx
;
1300 rm
->rm_col
[0].rc_offset
= rm
->rm_col
[1].rc_offset
;
1301 rm
->rm_col
[1].rc_devidx
= devidx
;
1302 rm
->rm_col
[1].rc_offset
= o
;
1304 if (rm
->rm_skipstart
== 0)
1305 rm
->rm_skipstart
= 1;
1312 vdev_raidz_map_free(raidz_map_t
*rm
)
1316 for (c
= rm
->rm_firstdatacol
- 1; c
>= 0; c
--)
1317 zfs_free(rm
->rm_col
[c
].rc_data
, rm
->rm_col
[c
].rc_size
);
1319 zfs_free(rm
, offsetof(raidz_map_t
, rm_col
[rm
->rm_scols
]));
1323 vdev_child(vdev_t
*pvd
, uint64_t devidx
)
1327 STAILQ_FOREACH(cvd
, &pvd
->v_children
, v_childlink
) {
1328 if (cvd
->v_id
== devidx
)
1336 * We keep track of whether or not there were any injected errors, so that
1337 * any ereports we generate can note it.
1340 raidz_checksum_verify(const spa_t
*spa
, const blkptr_t
*bp
, void *data
,
1344 return (zio_checksum_verify(spa
, bp
, data
));
1348 * Generate the parity from the data columns. If we tried and were able to
1349 * read the parity without error, verify that the generated parity matches the
1350 * data we read. If it doesn't, we fire off a checksum error. Return the
1351 * number such failures.
1354 raidz_parity_verify(raidz_map_t
*rm
)
1356 void *orig
[VDEV_RAIDZ_MAXPARITY
];
1360 for (c
= 0; c
< rm
->rm_firstdatacol
; c
++) {
1361 rc
= &rm
->rm_col
[c
];
1362 if (!rc
->rc_tried
|| rc
->rc_error
!= 0)
1364 orig
[c
] = zfs_alloc(rc
->rc_size
);
1365 bcopy(rc
->rc_data
, orig
[c
], rc
->rc_size
);
1368 vdev_raidz_generate_parity(rm
);
1370 for (c
= rm
->rm_firstdatacol
- 1; c
>= 0; c
--) {
1371 rc
= &rm
->rm_col
[c
];
1372 if (!rc
->rc_tried
|| rc
->rc_error
!= 0)
1374 if (bcmp(orig
[c
], rc
->rc_data
, rc
->rc_size
) != 0) {
1375 rc
->rc_error
= ECKSUM
;
1378 zfs_free(orig
[c
], rc
->rc_size
);
1385 * Iterate over all combinations of bad data and attempt a reconstruction.
1386 * Note that the algorithm below is non-optimal because it doesn't take into
1387 * account how reconstruction is actually performed. For example, with
1388 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1389 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1390 * cases we'd only use parity information in column 0.
1393 vdev_raidz_combrec(const spa_t
*spa
, raidz_map_t
*rm
, const blkptr_t
*bp
,
1394 void *data
, off_t offset
, uint64_t bytes
, int total_errors
, int data_errors
)
1397 void *orig
[VDEV_RAIDZ_MAXPARITY
];
1398 int tstore
[VDEV_RAIDZ_MAXPARITY
+ 2];
1399 int *tgts
= &tstore
[1];
1400 int current
, next
, i
, c
, n
;
1403 ASSERT(total_errors
< rm
->rm_firstdatacol
);
1406 * This simplifies one edge condition.
1410 for (n
= 1; n
<= rm
->rm_firstdatacol
- total_errors
; n
++) {
1412 * Initialize the targets array by finding the first n columns
1413 * that contain no error.
1415 * If there were no data errors, we need to ensure that we're
1416 * always explicitly attempting to reconstruct at least one
1417 * data column. To do this, we simply push the highest target
1418 * up into the data columns.
1420 for (c
= 0, i
= 0; i
< n
; i
++) {
1421 if (i
== n
- 1 && data_errors
== 0 &&
1422 c
< rm
->rm_firstdatacol
) {
1423 c
= rm
->rm_firstdatacol
;
1426 while (rm
->rm_col
[c
].rc_error
!= 0) {
1428 ASSERT3S(c
, <, rm
->rm_cols
);
1435 * Setting tgts[n] simplifies the other edge condition.
1437 tgts
[n
] = rm
->rm_cols
;
1440 * These buffers were allocated in previous iterations.
1442 for (i
= 0; i
< n
- 1; i
++) {
1443 ASSERT(orig
[i
] != NULL
);
1446 orig
[n
- 1] = zfs_alloc(rm
->rm_col
[0].rc_size
);
1449 next
= tgts
[current
];
1451 while (current
!= n
) {
1452 tgts
[current
] = next
;
1456 * Save off the original data that we're going to
1457 * attempt to reconstruct.
1459 for (i
= 0; i
< n
; i
++) {
1460 ASSERT(orig
[i
] != NULL
);
1463 ASSERT3S(c
, <, rm
->rm_cols
);
1464 rc
= &rm
->rm_col
[c
];
1465 bcopy(rc
->rc_data
, orig
[i
], rc
->rc_size
);
1469 * Attempt a reconstruction and exit the outer loop on
1472 code
= vdev_raidz_reconstruct(rm
, tgts
, n
);
1473 if (raidz_checksum_verify(spa
, bp
, data
, bytes
) == 0) {
1474 for (i
= 0; i
< n
; i
++) {
1476 rc
= &rm
->rm_col
[c
];
1477 ASSERT(rc
->rc_error
== 0);
1478 rc
->rc_error
= ECKSUM
;
1486 * Restore the original data.
1488 for (i
= 0; i
< n
; i
++) {
1490 rc
= &rm
->rm_col
[c
];
1491 bcopy(orig
[i
], rc
->rc_data
, rc
->rc_size
);
1496 * Find the next valid column after the current
1499 for (next
= tgts
[current
] + 1;
1500 next
< rm
->rm_cols
&&
1501 rm
->rm_col
[next
].rc_error
!= 0; next
++)
1504 ASSERT(next
<= tgts
[current
+ 1]);
1507 * If that spot is available, we're done here.
1509 if (next
!= tgts
[current
+ 1])
1513 * Otherwise, find the next valid column after
1514 * the previous position.
1516 for (c
= tgts
[current
- 1] + 1;
1517 rm
->rm_col
[c
].rc_error
!= 0; c
++)
1523 } while (current
!= n
);
1528 for (i
= n
- 1; i
>= 0; i
--) {
1529 zfs_free(orig
[i
], rm
->rm_col
[0].rc_size
);
1536 vdev_raidz_read(vdev_t
*vd
, const blkptr_t
*bp
, void *data
,
1537 off_t offset
, size_t bytes
)
1539 vdev_t
*tvd
= vd
->v_top
;
1544 int unexpected_errors
;
1550 int tgts
[VDEV_RAIDZ_MAXPARITY
];
1553 rc
= NULL
; /* gcc */
1556 rm
= vdev_raidz_map_alloc(data
, offset
, bytes
, tvd
->v_ashift
,
1557 vd
->v_nchildren
, vd
->v_nparity
);
1560 * Iterate over the columns in reverse order so that we hit the parity
1561 * last -- any errors along the way will force us to read the parity.
1563 for (c
= rm
->rm_cols
- 1; c
>= 0; c
--) {
1564 rc
= &rm
->rm_col
[c
];
1565 cvd
= vdev_child(vd
, rc
->rc_devidx
);
1566 if (cvd
== NULL
|| cvd
->v_state
!= VDEV_STATE_HEALTHY
) {
1567 if (c
>= rm
->rm_firstdatacol
)
1568 rm
->rm_missingdata
++;
1570 rm
->rm_missingparity
++;
1571 rc
->rc_error
= ENXIO
;
1572 rc
->rc_tried
= 1; /* don't even try */
1576 #if 0 /* XXX: Too hard for the boot code. */
1577 if (vdev_dtl_contains(cvd
, DTL_MISSING
, zio
->io_txg
, 1)) {
1578 if (c
>= rm
->rm_firstdatacol
)
1579 rm
->rm_missingdata
++;
1581 rm
->rm_missingparity
++;
1582 rc
->rc_error
= ESTALE
;
1587 if (c
>= rm
->rm_firstdatacol
|| rm
->rm_missingdata
> 0) {
1588 rc
->rc_error
= cvd
->v_read(cvd
, NULL
, rc
->rc_data
,
1589 rc
->rc_offset
, rc
->rc_size
);
1596 unexpected_errors
= 0;
1602 ASSERT(rm
->rm_missingparity
<= rm
->rm_firstdatacol
);
1603 ASSERT(rm
->rm_missingdata
<= rm
->rm_cols
- rm
->rm_firstdatacol
);
1605 for (c
= 0; c
< rm
->rm_cols
; c
++) {
1606 rc
= &rm
->rm_col
[c
];
1609 ASSERT(rc
->rc_error
!= ECKSUM
); /* child has no bp */
1611 if (c
< rm
->rm_firstdatacol
)
1616 if (!rc
->rc_skipped
)
1617 unexpected_errors
++;
1620 } else if (c
< rm
->rm_firstdatacol
&& !rc
->rc_tried
) {
1626 * There are three potential phases for a read:
1627 * 1. produce valid data from the columns read
1628 * 2. read all disks and try again
1629 * 3. perform combinatorial reconstruction
1631 * Each phase is progressively both more expensive and less likely to
1632 * occur. If we encounter more errors than we can repair or all phases
1633 * fail, we have no choice but to return an error.
1637 * If the number of errors we saw was correctable -- less than or equal
1638 * to the number of parity disks read -- attempt to produce data that
1639 * has a valid checksum. Naturally, this case applies in the absence of
1642 if (total_errors
<= rm
->rm_firstdatacol
- parity_untried
) {
1643 if (data_errors
== 0) {
1644 if (raidz_checksum_verify(vd
->spa
, bp
, data
, bytes
) == 0) {
1646 * If we read parity information (unnecessarily
1647 * as it happens since no reconstruction was
1648 * needed) regenerate and verify the parity.
1649 * We also regenerate parity when resilvering
1650 * so we can write it out to the failed device
1653 if (parity_errors
+ parity_untried
<
1654 rm
->rm_firstdatacol
) {
1655 n
= raidz_parity_verify(rm
);
1656 unexpected_errors
+= n
;
1657 ASSERT(parity_errors
+ n
<=
1658 rm
->rm_firstdatacol
);
1664 * We either attempt to read all the parity columns or
1665 * none of them. If we didn't try to read parity, we
1666 * wouldn't be here in the correctable case. There must
1667 * also have been fewer parity errors than parity
1668 * columns or, again, we wouldn't be in this code path.
1670 ASSERT(parity_untried
== 0);
1671 ASSERT(parity_errors
< rm
->rm_firstdatacol
);
1674 * Identify the data columns that reported an error.
1677 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
1678 rc
= &rm
->rm_col
[c
];
1679 if (rc
->rc_error
!= 0) {
1680 ASSERT(n
< VDEV_RAIDZ_MAXPARITY
);
1685 ASSERT(rm
->rm_firstdatacol
>= n
);
1687 code
= vdev_raidz_reconstruct(rm
, tgts
, n
);
1689 if (raidz_checksum_verify(vd
->spa
, bp
, data
, bytes
) == 0) {
1691 * If we read more parity disks than were used
1692 * for reconstruction, confirm that the other
1693 * parity disks produced correct data. This
1694 * routine is suboptimal in that it regenerates
1695 * the parity that we already used in addition
1696 * to the parity that we're attempting to
1697 * verify, but this should be a relatively
1698 * uncommon case, and can be optimized if it
1699 * becomes a problem. Note that we regenerate
1700 * parity when resilvering so we can write it
1701 * out to failed devices later.
1703 if (parity_errors
< rm
->rm_firstdatacol
- n
) {
1704 n
= raidz_parity_verify(rm
);
1705 unexpected_errors
+= n
;
1706 ASSERT(parity_errors
+ n
<=
1707 rm
->rm_firstdatacol
);
1716 * This isn't a typical situation -- either we got a read
1717 * error or a child silently returned bad data. Read every
1718 * block so we can try again with as much data and parity as
1719 * we can track down. If we've already been through once
1720 * before, all children will be marked as tried so we'll
1721 * proceed to combinatorial reconstruction.
1723 unexpected_errors
= 1;
1724 rm
->rm_missingdata
= 0;
1725 rm
->rm_missingparity
= 0;
1728 for (c
= 0; c
< rm
->rm_cols
; c
++) {
1729 rc
= &rm
->rm_col
[c
];
1734 cvd
= vdev_child(vd
, rc
->rc_devidx
);
1735 ASSERT(cvd
!= NULL
);
1736 rc
->rc_error
= cvd
->v_read(cvd
, NULL
,
1737 rc
->rc_data
, rc
->rc_offset
, rc
->rc_size
);
1738 if (rc
->rc_error
== 0)
1744 * If we managed to read anything more, retry the
1751 * At this point we've attempted to reconstruct the data given the
1752 * errors we detected, and we've attempted to read all columns. There
1753 * must, therefore, be one or more additional problems -- silent errors
1754 * resulting in invalid data rather than explicit I/O errors resulting
1755 * in absent data. We check if there is enough additional data to
1756 * possibly reconstruct the data and then perform combinatorial
1757 * reconstruction over all possible combinations. If that fails,
1760 if (total_errors
> rm
->rm_firstdatacol
) {
1762 } else if (total_errors
< rm
->rm_firstdatacol
&&
1763 (code
= vdev_raidz_combrec(vd
->spa
, rm
, bp
, data
, offset
, bytes
,
1764 total_errors
, data_errors
)) != 0) {
1766 * If we didn't use all the available parity for the
1767 * combinatorial reconstruction, verify that the remaining
1768 * parity is correct.
1770 if (code
!= (1 << rm
->rm_firstdatacol
) - 1)
1771 (void) raidz_parity_verify(rm
);
1774 * We're here because either:
1776 * total_errors == rm_first_datacol, or
1777 * vdev_raidz_combrec() failed
1779 * In either case, there is enough bad data to prevent
1782 * Start checksum ereports for all children which haven't
1783 * failed, and the IO wasn't speculative.
1789 vdev_raidz_map_free(rm
);