loader: add skein/edonr support
[unleashed.git] / usr/src/boot/sys/cddl/boot/zfs/zfssubr.c
blob e38f14f33b00efbbc255022429e39dc068060c4c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
#include <sys/cdefs.h>

static uint64_t zfs_crc64_table[256];

#define	ECKSUM	666

#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)

#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)

#define	kmem_alloc(size, flag)	zfs_alloc((size))
#define	kmem_free(ptr, size)	zfs_free((ptr), (size))
static void
zfs_init_crc(void)
{
	int i, j;
	uint64_t *ct;

	/*
	 * Calculate the crc64 table (used for the zap hash
	 * function).
	 */
	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
		memset(zfs_crc64_table, 0, sizeof (zfs_crc64_table));
		for (i = 0; i < 256; i++)
			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
	}
}
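/*
 * Note on the guard above: once the table is filled in,
 * zfs_crc64_table[128] equals ZFS_CRC64_POLY -- the entry for 0x80
 * shifts right seven times with a zero low bit and the final shift
 * XORs in the polynomial -- so the comparison doubles as an
 * "already initialized" check.
 */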
static void
zio_checksum_off(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
typedef void zio_checksum_tmpl_free_t(void *ctx_template);
typedef enum zio_checksum_flags {
	/* Strong enough for metadata? */
	ZCHECKSUM_FLAG_METADATA = (1 << 1),
	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
	/* Strong enough for dedup (without verification)? */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
	/* Uses salt value */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),
	/* Strong enough for nopwrite? */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;
/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	/* checksum function for each byteorder */
	zio_checksum_t			*ci_func[2];
	zio_checksum_tmpl_init_t	*ci_tmpl_init;
	zio_checksum_tmpl_free_t	*ci_tmpl_free;
	zio_checksum_flags_t		ci_flags;
	const char			*ci_name;	/* descriptive name */
} zio_checksum_info_t;
#include "blkptr.c"

#include "fletcher.c"
#include "sha256.c"
#include "skein_zfs.c"
#include "edonr_zfs.c"
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
	{{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
	    0, "fletcher2"},
	{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
	{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
	{{zio_checksum_off, zio_checksum_off}, NULL, NULL,
	    0, "noparity"},
	{{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
	{{zio_checksum_edonr_native, zio_checksum_edonr_byteswap},
	    zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
	    ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
};
/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

extern int gzip_decompress(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;
#include "lzjb.c"
#include "zle.c"
#include "lz4.c"
/*
 * Compression vectors.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL, NULL, 0, "inherit"},
	{NULL, NULL, 0, "on"},
	{NULL, NULL, 0, "uncompressed"},
	{NULL, lzjb_decompress, 0, "lzjb"},
	{NULL, NULL, 0, "empty"},
	{NULL, gzip_decompress, 1, "gzip-1"},
	{NULL, gzip_decompress, 2, "gzip-2"},
	{NULL, gzip_decompress, 3, "gzip-3"},
	{NULL, gzip_decompress, 4, "gzip-4"},
	{NULL, gzip_decompress, 5, "gzip-5"},
	{NULL, gzip_decompress, 6, "gzip-6"},
	{NULL, gzip_decompress, 7, "gzip-7"},
	{NULL, gzip_decompress, 8, "gzip-8"},
	{NULL, gzip_decompress, 9, "gzip-9"},
	{NULL, zle_decompress, 64, "zle"},
	{NULL, lz4_decompress, 0, "lz4"},
};
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	int i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}
/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}
/*
 * Calls the template init function of a checksum which supports context
 * templates and installs the template into the spa_t.
 */
static void
zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
{
	zio_checksum_info_t *ci = &zio_checksum_table[checksum];

	if (ci->ci_tmpl_init == NULL)
		return;

	if (spa->spa_cksum_tmpls[checksum] != NULL)
		return;

	if (spa->spa_cksum_tmpls[checksum] == NULL) {
		spa->spa_cksum_tmpls[checksum] =
		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
	}
}
/*
 * Called by a spa_t that's about to be deallocated. This steps through
 * all of the checksum context templates and deallocates any that were
 * initialized using the algorithm-specific template init function.
 */
void
zio_checksum_templates_free(spa_t *spa)
{
	for (enum zio_checksum checksum = 0;
	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
		if (spa->spa_cksum_tmpls[checksum] != NULL) {
			zio_checksum_info_t *ci = &zio_checksum_table[checksum];

			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
			spa->spa_cksum_tmpls[checksum] = NULL;
		}
	}
}
static int
zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	void *ctx = NULL;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (spa != NULL) {
		zio_checksum_template_init(checksum, (spa_t *)spa);
		ctx = spa->spa_cksum_tmpls[checksum];
	}

	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		expected_cksum = bp->blk_cksum;
		ci->ci_func[0](data, size, ctx, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/* printf("ZFS: read checksum %s failed\n", ci->ci_name); */
		return (EIO);
	}

	return (0);
}
static int
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
    void *dest, uint64_t destsize)
{
	zio_compress_info_t *ci;

	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
		return (EIO);
	}

	ci = &zio_compress_table[cpfunc];
	if (!ci->ci_decompress) {
		printf("ZFS: unsupported compression algorithm %s\n",
		    ci->ci_name);
		return (EIO);
	}

	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
}
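/*
 * A minimal usage sketch (hypothetical caller; it assumes the standard
 * ZIO_COMPRESS_LZ4 enum value and a PSIZE/LSIZE pair taken from a block
 * pointer):
 *
 *	void *pbuf;	(physical, i.e. compressed, buffer of psize bytes)
 *	void *lbuf;	(logical buffer of lsize bytes)
 *
 *	if (zio_decompress_data(ZIO_COMPRESS_LZ4, pbuf, psize,
 *	    lbuf, lsize) != 0)
 *		return (EIO);	(unknown algorithm or corrupt stream)
 */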
static uint64_t
zap_hash(uint64_t salt, const char *name)
{
	const uint8_t *cp;
	uint8_t c;
	uint64_t crc = salt;

	ASSERT(crc != 0);
	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

	/*
	 * Only use 28 bits, since we need 4 bits in the cookie for the
	 * collision differentiator. We MUST use the high bits, since
	 * those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

	return (crc);
}
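/*
 * Example (hypothetical caller): a zap lookup hashes the entry name
 * with the salt stored in the zap's phys block, e.g.
 *
 *	uint64_t h = zap_hash(salt, "bootfs");
 *
 * and then uses the top ZAP_HASHBITS bits of h to pick the hash bucket,
 * leaving the cleared low bits free for the collision differentiator.
 */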
static void *zfs_alloc(size_t size);
static void zfs_free(void *ptr, size_t size);

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P	0
#define	VDEV_RAIDZ_Q	1
#define	VDEV_RAIDZ_R	2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
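/*
 * Worked example for a single lane: take the byte 0x80 (top bit set).
 * The mask for that lane becomes (0x80 << 1) - (0x80 >> 7) =
 * 0x100 - 0x01 = 0xff, so the XOR of 0x1d is applied:
 * ((0x80 << 1) & 0xfe) ^ 0x1d = 0x00 ^ 0x1d = 0x1d, exactly
 * VDEV_RAIDZ_MUL_2(0x80). A byte with the top bit clear produces a
 * 0x00 mask and reduces to a plain left shift.
 */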
/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}
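/*
 * For example, vdev_raidz_exp2(0x10, 3) looks up vdev_raidz_log2[0x10]
 * = 4, adds the exponent to get 7, and returns vdev_raidz_pow2[7] =
 * 0x80 -- i.e. 0x10 * 2^3 in GF(2^8).
 */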
static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount __attribute__((unused)), ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		panic("invalid RAID-Z configuration");
	}
}
/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen both for simplicity and speed
 * of computation, as well as for linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *
 *             __                               __
 *             |   1   1   1   1   1   1   1   1 |
 *             | 128  64  32  16   8   4   2   1 | <-----+-+-- missing disks
 *             |  19 205 116  29  64  16   4   1 |      / /
 *             |   1   0   0   0   0   0   0   0 |     / /
 *             |   0   1   0   0   0   0   0   0 | <--' /
 *  (V|I)  =   |   0   0   1   0   0   0   0   0 | <---'
 *             |   0   0   0   1   0   0   0   0 |
 *             |   0   0   0   0   1   0   0   0 |
 *             |   0   0   0   0   0   1   0   0 |
 *             |   0   0   0   0   0   0   1   0 |
 *             |   0   0   0   0   0   0   0   1 |
 *             ~~                               ~~
 *             __                               __
 *             |   1   1   1   1   1   1   1   1 |
 *             |  19 205 116  29  64  16   4   1 |
 *             |   1   0   0   0   0   0   0   0 |
 *  (V|I)' =   |   0   0   0   1   0   0   0   0 |
 *             |   0   0   0   0   1   0   0   0 |
 *             |   0   0   0   0   0   1   0   0 |
 *             |   0   0   0   0   0   0   1   0 |
 *             |   0   0   0   0   0   0   0   1 |
 *             ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0 |
 * |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0 |
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0 |
 * |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 * |   0 205 116   0   0   0   0   0     0   1  19  29  64  16   4   1 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 * |   0   0 185   0   0   0   0   0   205   1 222 208 141 221 201 204 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 * |   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   1   0   0   0   0   0   0   167 100   5  41 159 169 217 208 |
 * |   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 *                   __                               __
 *                   |   0   0   1   0   0   0   0   0 |
 *                   | 167 100   5  41 159 169 217 208 |
 *                   | 166 100   4  40 158 168 216 209 |
 *   (V|I)'^-1 =     |   0   0   0   1   0   0   0   0 |
 *                   |   0   0   0   0   1   0   0   0 |
 *                   |   0   0   0   0   0   1   0   0 |
 *                   |   0   0   0   0   0   0   1   0 |
 *                   |   0   0   0   0   0   0   0   1 |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
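/*
 * Concretely, the row built for map[i] holds the parity coefficients in
 * column order: rows[i][j] = 2^(map[i] * (n - 1 - j)) in GF(2^8), so a
 * P row (map[i] == 0) is all ones and a Q row (map[i] == 1) is
 * 2^(n-1), ..., 4, 2, 1 -- matching the Vandermonde rows in the comment
 * above.
 */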
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = zfs_alloc(psize);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	zfs_free(p, psize);
}
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = offset >> unit_shift;
	uint64_t s = size >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
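/*
 * A worked example: for a raidz1 vdev with dcols = 4 children,
 * unit_shift = 9 (512-byte sectors), and an 8-sector read (s = 8),
 * q = 8 / 3 = 2, r = 2, bc = 3, and tot = 11. Columns 0-2 each get
 * q + 1 = 3 sectors (column 0 being parity) and column 3 gets q = 2,
 * for asize = 11 sectors; rm_nskip = roundup(11, 2) - 11 = 1 padding
 * sector.
 */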
static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}
static vdev_t *
vdev_child(vdev_t *pvd, uint64_t devidx)
{
	vdev_t *cvd;

	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
		if (cvd->v_id == devidx)
			break;
	}

	return (cvd);
}
/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
    uint64_t size)
{
	return (zio_checksum_verify(spa, bp, data));
}
/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zfs_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		zfs_free(orig[c], rc->rc_size);
	}

	return (ret);
}
1385 * Iterate over all combinations of bad data and attempt a reconstruction.
1386 * Note that the algorithm below is non-optimal because it doesn't take into
1387 * account how reconstruction is actually performed. For example, with
1388 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1389 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1390 * cases we'd only use parity information in column 0.
1392 static int
1393 vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
1394 void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
1396 raidz_col_t *rc;
1397 void *orig[VDEV_RAIDZ_MAXPARITY];
1398 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1399 int *tgts = &tstore[1];
1400 int current, next, i, c, n;
1401 int code, ret = 0;
1403 ASSERT(total_errors < rm->rm_firstdatacol);
1406 * This simplifies one edge condition.
1408 tgts[-1] = -1;
1410 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1412 * Initialize the targets array by finding the first n columns
1413 * that contain no error.
1415 * If there were no data errors, we need to ensure that we're
1416 * always explicitly attempting to reconstruct at least one
1417 * data column. To do this, we simply push the highest target
1418 * up into the data columns.
1420 for (c = 0, i = 0; i < n; i++) {
1421 if (i == n - 1 && data_errors == 0 &&
1422 c < rm->rm_firstdatacol) {
1423 c = rm->rm_firstdatacol;
1426 while (rm->rm_col[c].rc_error != 0) {
1427 c++;
1428 ASSERT3S(c, <, rm->rm_cols);
1431 tgts[i] = c++;
1435 * Setting tgts[n] simplifies the other edge condition.
1437 tgts[n] = rm->rm_cols;
1440 * These buffers were allocated in previous iterations.
1442 for (i = 0; i < n - 1; i++) {
1443 ASSERT(orig[i] != NULL);
1446 orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);
1448 current = 0;
1449 next = tgts[current];
1451 while (current != n) {
1452 tgts[current] = next;
1453 current = 0;
1456 * Save off the original data that we're going to
1457 * attempt to reconstruct.
1459 for (i = 0; i < n; i++) {
1460 ASSERT(orig[i] != NULL);
1461 c = tgts[i];
1462 ASSERT3S(c, >=, 0);
1463 ASSERT3S(c, <, rm->rm_cols);
1464 rc = &rm->rm_col[c];
1465 bcopy(rc->rc_data, orig[i], rc->rc_size);
1469 * Attempt a reconstruction and exit the outer loop on
1470 * success.
1472 code = vdev_raidz_reconstruct(rm, tgts, n);
1473 if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
1474 for (i = 0; i < n; i++) {
1475 c = tgts[i];
1476 rc = &rm->rm_col[c];
1477 ASSERT(rc->rc_error == 0);
1478 rc->rc_error = ECKSUM;
1481 ret = code;
1482 goto done;
1486 * Restore the original data.
1488 for (i = 0; i < n; i++) {
1489 c = tgts[i];
1490 rc = &rm->rm_col[c];
1491 bcopy(orig[i], rc->rc_data, rc->rc_size);
1494 do {
1496 * Find the next valid column after the current
1497 * position..
1499 for (next = tgts[current] + 1;
1500 next < rm->rm_cols &&
1501 rm->rm_col[next].rc_error != 0; next++)
1502 continue;
1504 ASSERT(next <= tgts[current + 1]);
1507 * If that spot is available, we're done here.
1509 if (next != tgts[current + 1])
1510 break;
1513 * Otherwise, find the next valid column after
1514 * the previous position.
1516 for (c = tgts[current - 1] + 1;
1517 rm->rm_col[c].rc_error != 0; c++)
1518 continue;
1520 tgts[current] = c;
1521 current++;
1523 } while (current != n);
1526 n--;
1527 done:
1528 for (i = n - 1; i >= 0; i--) {
1529 zfs_free(orig[i], rm->rm_col[0].rc_size);
1532 return (ret);
static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0	/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(vd->spa, bp, data, bytes) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(vd->spa, bp, data, bytes) == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}

	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->spa, rm, bp, data, offset, bytes,
	    total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}