loader: add skein/edonr support
[unleashed.git] / usr/src/boot/sys/cddl/boot/zfs/zfssubr.c
blob e38f14f33b00efbbc255022429e39dc068060c4c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
#include <sys/cdefs.h>

static uint64_t zfs_crc64_table[256];

#define	ECKSUM	666

#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)

#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)

#define	kmem_alloc(size, flag)	zfs_alloc((size))
#define	kmem_free(ptr, size)	zfs_free((ptr), (size))
static void
zfs_init_crc(void)
{
	int i, j;
	uint64_t *ct;

	/*
	 * Calculate the crc64 table (used for the zap hash
	 * function).
	 */
	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
		memset(zfs_crc64_table, 0, sizeof (zfs_crc64_table));
		for (i = 0; i < 256; i++)
			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
	}
}
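/*
 * Note on the guard above: once the table is filled in,
 * zfs_crc64_table[128] equals ZFS_CRC64_POLY -- the entry for 0x80
 * shifts right seven times with a zero low bit and the final shift
 * XORs in the polynomial -- so the comparison doubles as an
 * "already initialized" check.
 */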
static void
zio_checksum_off(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
typedef void zio_checksum_tmpl_free_t(void *ctx_template);
typedef enum zio_checksum_flags {
	/* Strong enough for metadata? */
	ZCHECKSUM_FLAG_METADATA = (1 << 1),
	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
	/* Strong enough for dedup (without verification)? */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
	/* Uses salt value */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),
	/* Strong enough for nopwrite? */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;
/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	/* checksum function for each byteorder */
	zio_checksum_t			*ci_func[2];
	zio_checksum_tmpl_init_t	*ci_tmpl_init;
	zio_checksum_tmpl_free_t	*ci_tmpl_free;
	zio_checksum_flags_t		ci_flags;
	const char			*ci_name;	/* descriptive name */
} zio_checksum_info_t;
#include "blkptr.c"

#include "fletcher.c"
#include "sha256.c"
#include "skein_zfs.c"
#include "edonr_zfs.c"
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
	{{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
	    0, "fletcher2"},
	{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
	{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
	{{zio_checksum_off, zio_checksum_off}, NULL, NULL,
	    0, "noparity"},
	{{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
	{{zio_checksum_edonr_native, zio_checksum_edonr_byteswap},
	    zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
	    ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
};
/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

extern int gzip_decompress(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;
#include "lzjb.c"
#include "zle.c"
#include "lz4.c"
/*
 * Compression vectors.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL, NULL, 0, "inherit"},
	{NULL, NULL, 0, "on"},
	{NULL, NULL, 0, "uncompressed"},
	{NULL, lzjb_decompress, 0, "lzjb"},
	{NULL, NULL, 0, "empty"},
	{NULL, gzip_decompress, 1, "gzip-1"},
	{NULL, gzip_decompress, 2, "gzip-2"},
	{NULL, gzip_decompress, 3, "gzip-3"},
	{NULL, gzip_decompress, 4, "gzip-4"},
	{NULL, gzip_decompress, 5, "gzip-5"},
	{NULL, gzip_decompress, 6, "gzip-6"},
	{NULL, gzip_decompress, 7, "gzip-7"},
	{NULL, gzip_decompress, 8, "gzip-8"},
	{NULL, gzip_decompress, 9, "gzip-9"},
	{NULL, zle_decompress, 64, "zle"},
	{NULL, lz4_decompress, 0, "lz4"},
};
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	int i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}
/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}
/*
 * Calls the template init function of a checksum which supports context
 * templates and installs the template into the spa_t.
 */
static void
zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
{
	zio_checksum_info_t *ci = &zio_checksum_table[checksum];

	if (ci->ci_tmpl_init == NULL)
		return;

	if (spa->spa_cksum_tmpls[checksum] != NULL)
		return;

	if (spa->spa_cksum_tmpls[checksum] == NULL) {
		spa->spa_cksum_tmpls[checksum] =
		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
	}
}
/*
 * Called by a spa_t that's about to be deallocated. This steps through
 * all of the checksum context templates and deallocates any that were
 * initialized using the algorithm-specific template init function.
 */
void
zio_checksum_templates_free(spa_t *spa)
{
	for (enum zio_checksum checksum = 0;
	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
		if (spa->spa_cksum_tmpls[checksum] != NULL) {
			zio_checksum_info_t *ci = &zio_checksum_table[checksum];

			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
			spa->spa_cksum_tmpls[checksum] = NULL;
		}
	}
}
static int
zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	void *ctx = NULL;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (spa != NULL) {
		zio_checksum_template_init(checksum, (spa_t *)spa);
		ctx = spa->spa_cksum_tmpls[checksum];
	}

	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		expected_cksum = bp->blk_cksum;
		ci->ci_func[0](data, size, ctx, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/* printf("ZFS: read checksum %s failed\n", ci->ci_name); */
		return (EIO);
	}

	return (0);
}
static int
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
    void *dest, uint64_t destsize)
{
	zio_compress_info_t *ci;

	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
		return (EIO);
	}

	ci = &zio_compress_table[cpfunc];
	if (!ci->ci_decompress) {
		printf("ZFS: unsupported compression algorithm %s\n",
		    ci->ci_name);
		return (EIO);
	}

	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
}
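/*
 * A minimal usage sketch (hypothetical caller; it assumes the standard
 * ZIO_COMPRESS_LZ4 enum value and a PSIZE/LSIZE pair taken from a block
 * pointer):
 *
 *	void *pbuf;	(physical, i.e. compressed, buffer of psize bytes)
 *	void *lbuf;	(logical buffer of lsize bytes)
 *
 *	if (zio_decompress_data(ZIO_COMPRESS_LZ4, pbuf, psize,
 *	    lbuf, lsize) != 0)
 *		return (EIO);	(unknown algorithm or corrupt stream)
 */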
static uint64_t
zap_hash(uint64_t salt, const char *name)
{
	const uint8_t *cp;
	uint8_t c;
	uint64_t crc = salt;

	ASSERT(crc != 0);
	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

	/*
	 * Only use 28 bits, since we need 4 bits in the cookie for the
	 * collision differentiator. We MUST use the high bits, since
	 * those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

	return (crc);
}
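/*
 * Example (hypothetical caller): a zap lookup hashes the entry name
 * with the salt stored in the zap's phys block, e.g.
 *
 *	uint64_t h = zap_hash(salt, "bootfs");
 *
 * and then uses the top ZAP_HASHBITS bits of h to pick the hash bucket,
 * leaving the cleared low bits free for the collision differentiator.
 */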
static void *zfs_alloc(size_t size);
static void zfs_free(void *ptr, size_t size);

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P	0
#define	VDEV_RAIDZ_Q	1
#define	VDEV_RAIDZ_R	2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
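/*
 * Worked example for a single lane: take the byte 0x80 (top bit set).
 * The mask for that lane becomes (0x80 << 1) - (0x80 >> 7) =
 * 0x100 - 0x01 = 0xff, so the XOR of 0x1d is applied:
 * ((0x80 << 1) & 0xfe) ^ 0x1d = 0x00 ^ 0x1d = 0x1d, exactly
 * VDEV_RAIDZ_MUL_2(0x80). A byte with the top bit clear produces a
 * 0x00 mask and reduces to a plain left shift.
 */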
/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}
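/*
 * For example, vdev_raidz_exp2(0x10, 3) looks up vdev_raidz_log2[0x10]
 * = 4, adds the exponent to get 7, and returns vdev_raidz_pow2[7] =
 * 0x80 -- i.e. 0x10 * 2^3 in GF(2^8).
 */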
static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount __attribute__((unused)), ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		panic("invalid RAID-Z configuration");
	}
}
/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen both for simplicity and speed
 * of computation, as well as for linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *
 *             __                               __
 *             |   1   1   1   1   1   1   1   1 |
 *             | 128  64  32  16   8   4   2   1 | <-----+-+-- missing disks
 *             |  19 205 116  29  64  16   4   1 |      / /
 *             |   1   0   0   0   0   0   0   0 |     / /
 *             |   0   1   0   0   0   0   0   0 | <--' /
 *  (V|I)  =   |   0   0   1   0   0   0   0   0 | <---'
 *             |   0   0   0   1   0   0   0   0 |
 *             |   0   0   0   0   1   0   0   0 |
 *             |   0   0   0   0   0   1   0   0 |
 *             |   0   0   0   0   0   0   1   0 |
 *             |   0   0   0   0   0   0   0   1 |
 *             ~~                               ~~
 *             __                               __
 *             |   1   1   1   1   1   1   1   1 |
 *             |  19 205 116  29  64  16   4   1 |
 *             |   1   0   0   0   0   0   0   0 |
 *  (V|I)' =   |   0   0   0   1   0   0   0   0 |
 *             |   0   0   0   0   1   0   0   0 |
 *             |   0   0   0   0   0   1   0   0 |
 *             |   0   0   0   0   0   0   1   0 |
 *             |   0   0   0   0   0   0   0   1 |
 *             ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0 |
 * |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0 |
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0 |
 * |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 * |   0 205 116   0   0   0   0   0     0   1  19  29  64  16   4   1 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 * |   0   0 185   0   0   0   0   0   205   1 222 208 141 221 201 204 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 * |   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 * __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   1   0   0   0   0   0   0   167 100   5  41 159 169 217 208 |
 * |   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 * ~~                                                                 ~~
 *                   __                               __
 *                   |   0   0   1   0   0   0   0   0 |
 *                   | 167 100   5  41 159 169 217 208 |
 *                   | 166 100   4  40 158 168 216 209 |
 *   (V|I)'^-1 =     |   0   0   0   1   0   0   0   0 |
 *                   |   0   0   0   0   1   0   0   0 |
 *                   |   0   0   0   0   0   1   0   0 |
 *                   |   0   0   0   0   0   0   1   0 |
 *                   |   0   0   0   0   0   0   0   1 |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
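/*
 * Concretely, the row built for map[i] holds the parity coefficients in
 * column order: rows[i][j] = 2^(map[i] * (n - 1 - j)) in GF(2^8), so a
 * P row (map[i] == 0) is all ones and a Q row (map[i] == 1) is
 * 2^(n-1), ..., 4, 2, 1 -- matching the Vandermonde rows in the comment
 * above.
 */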
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = zfs_alloc(psize);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	zfs_free(p, psize);
}
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = offset >> unit_shift;
	uint64_t s = size >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
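/*
 * A worked example: for a raidz1 vdev with dcols = 4 children,
 * unit_shift = 9 (512-byte sectors), and an 8-sector read (s = 8),
 * q = 8 / 3 = 2, r = 2, bc = 3, and tot = 11. Columns 0-2 each get
 * q + 1 = 3 sectors (column 0 being parity) and column 3 gets q = 2,
 * for asize = 11 sectors; rm_nskip = roundup(11, 2) - 11 = 1 padding
 * sector.
 */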
static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}
static vdev_t *
vdev_child(vdev_t *pvd, uint64_t devidx)
{
	vdev_t *cvd;

	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
		if (cvd->v_id == devidx)
			break;
	}

	return (cvd);
}
/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
    uint64_t size)
{
	return (zio_checksum_verify(spa, bp, data));
}
/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zfs_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		zfs_free(orig[c], rc->rc_size);
	}

	return (ret);
}
1385 * Iterate over all combinations of bad data and attempt a reconstruction.
1386 * Note that the algorithm below is non-optimal because it doesn't take into
1387 * account how reconstruction is actually performed. For example, with
1388 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1389 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1390 * cases we'd only use parity information in column 0.
1392 static int
1393 vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
1394 void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
1396 raidz_col_t *rc;
1397 void *orig[VDEV_RAIDZ_MAXPARITY];
1398 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1399 int *tgts = &tstore[1];
1400 int current, next, i, c, n;
1401 int code, ret = 0;
1403 ASSERT(total_errors < rm->rm_firstdatacol);
1406 * This simplifies one edge condition.
1408 tgts[-1] = -1;
1410 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1412 * Initialize the targets array by finding the first n columns
1413 * that contain no error.
1415 * If there were no data errors, we need to ensure that we're
1416 * always explicitly attempting to reconstruct at least one
1417 * data column. To do this, we simply push the highest target
1418 * up into the data columns.
1420 for (c = 0, i = 0; i < n; i++) {
1421 if (i == n - 1 && data_errors == 0 &&
1422 c < rm->rm_firstdatacol) {
1423 c = rm->rm_firstdatacol;
1426 while (rm->rm_col[c].rc_error != 0) {
1427 c++;
1428 ASSERT3S(c, <, rm->rm_cols);
1431 tgts[i] = c++;
1435 * Setting tgts[n] simplifies the other edge condition.
1437 tgts[n] = rm->rm_cols;
1440 * These buffers were allocated in previous iterations.
1442 for (i = 0; i < n - 1; i++) {
1443 ASSERT(orig[i] != NULL);
1446 orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);
1448 current = 0;
1449 next = tgts[current];
1451 while (current != n) {
1452 tgts[current] = next;
1453 current = 0;
1456 * Save off the original data that we're going to
1457 * attempt to reconstruct.
1459 for (i = 0; i < n; i++) {
1460 ASSERT(orig[i] != NULL);
1461 c = tgts[i];
1462 ASSERT3S(c, >=, 0);
1463 ASSERT3S(c, <, rm->rm_cols);
1464 rc = &rm->rm_col[c];
1465 bcopy(rc->rc_data, orig[i], rc->rc_size);
1469 * Attempt a reconstruction and exit the outer loop on
1470 * success.
1472 code = vdev_raidz_reconstruct(rm, tgts, n);
1473 if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
1474 for (i = 0; i < n; i++) {
1475 c = tgts[i];
1476 rc = &rm->rm_col[c];
1477 ASSERT(rc->rc_error == 0);
1478 rc->rc_error = ECKSUM;
1481 ret = code;
1482 goto done;
1486 * Restore the original data.
1488 for (i = 0; i < n; i++) {
1489 c = tgts[i];
1490 rc = &rm->rm_col[c];
1491 bcopy(orig[i], rc->rc_data, rc->rc_size);
1494 do {
1496 * Find the next valid column after the current
1497 * position..
1499 for (next = tgts[current] + 1;
1500 next < rm->rm_cols &&
1501 rm->rm_col[next].rc_error != 0; next++)
1502 continue;
1504 ASSERT(next <= tgts[current + 1]);
1507 * If that spot is available, we're done here.
1509 if (next != tgts[current + 1])
1510 break;
1513 * Otherwise, find the next valid column after
1514 * the previous position.
1516 for (c = tgts[current - 1] + 1;
1517 rm->rm_col[c].rc_error != 0; c++)
1518 continue;
1520 tgts[current] = c;
1521 current++;
1523 } while (current != n);
1526 n--;
1527 done:
1528 for (i = n - 1; i >= 0; i--) {
1529 zfs_free(orig[i], rm->rm_col[0].rc_size);
1532 return (ret);
static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0	/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(vd->spa, bp, data, bytes) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(vd->spa, bp, data, bytes) == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}

	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->spa, rm, bp, data, offset, bytes,
	    total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}