4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012 by Delphix. All rights reserved.
27 #include <sys/zfs_context.h>
29 #include <sys/vdev_impl.h>
31 #include <sys/zio_checksum.h>
32 #include <sys/fs/zfs.h>
33 #include <sys/fm/fs/zfs.h>
36 * Virtual device vector for RAID-Z.
38 * This vdev supports single, double, and triple parity. For single parity,
39 * we use a simple XOR of all the data columns. For double or triple parity,
40 * we use a special case of Reed-Solomon coding. This extends the
41 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
42 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
43 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
44 * former is also based. The latter is designed to provide higher performance
47 * Note that the Plank paper claimed to support arbitrary N+M, but was then
48 * amended six years later identifying a critical flaw that invalidates its
49 * claims. Nevertheless, the technique can be adapted to work for up to
50 * triple parity. For additional parity, the amendment "Note: Correction to
51 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
52 * is viable, but the additional complexity means that write performance will
55 * All of the methods above operate on a Galois field, defined over the
56 * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
57 * can be expressed with a single byte. Briefly, the operations on the
58 * field are defined as follows:
60 * o addition (+) is represented by a bitwise XOR
61 * o subtraction (-) is therefore identical to addition: A + B = A - B
62 * o multiplication of A by 2 is defined by the following bitwise expression:
66 * (A * 2)_4 = A_3 + A_7
67 * (A * 2)_3 = A_2 + A_7
68 * (A * 2)_2 = A_1 + A_7
72 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
73 * As an aside, this multiplication is derived from the error correcting
74 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
76 * Observe that any number in the field (except for 0) can be expressed as a
77 * power of 2 -- a generator for the field. We store a table of the powers of
78 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
79 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
80 * than field addition). The inverse of a field element A (A^-1) is therefore
81 * A ^ (255 - 1) = A^254.
83 * The up-to-three parity columns, P, Q, R over several data columns,
84 * D_0, ... D_n-1, can be expressed by field operations:
86 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
87 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
88 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
89 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
90 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
92 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
93 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
94 * independent coefficients. (There are no additional coefficients that have
95 * this property which is why the uncorrected Plank method breaks down.)
97 * See the reconstruction code below for how P, Q and R can used individually
98 * or in concert to recover missing data columns.
101 typedef struct raidz_col
{
102 uint64_t rc_devidx
; /* child device index for I/O */
103 uint64_t rc_offset
; /* device offset */
104 uint64_t rc_size
; /* I/O size */
105 void *rc_data
; /* I/O data */
106 void *rc_gdata
; /* used to store the "good" version */
107 int rc_error
; /* I/O error for this device */
108 uint8_t rc_tried
; /* Did we attempt this I/O column? */
109 uint8_t rc_skipped
; /* Did we skip this I/O column? */
112 typedef struct raidz_map
{
113 uint64_t rm_cols
; /* Regular column count */
114 uint64_t rm_scols
; /* Count including skipped columns */
115 uint64_t rm_bigcols
; /* Number of oversized columns */
116 uint64_t rm_asize
; /* Actual total I/O size */
117 uint64_t rm_missingdata
; /* Count of missing data devices */
118 uint64_t rm_missingparity
; /* Count of missing parity devices */
119 uint64_t rm_firstdatacol
; /* First data column/parity count */
120 uint64_t rm_nskip
; /* Skipped sectors for padding */
121 uint64_t rm_skipstart
; /* Column index of padding start */
122 void *rm_datacopy
; /* rm_asize-buffer of copied data */
123 uintptr_t rm_reports
; /* # of referencing checksum reports */
124 uint8_t rm_freed
; /* map no longer has referencing ZIO */
125 uint8_t rm_ecksuminjected
; /* checksum error was injected */
126 raidz_col_t rm_col
[1]; /* Flexible array of I/O columns */
129 #define VDEV_RAIDZ_P 0
130 #define VDEV_RAIDZ_Q 1
131 #define VDEV_RAIDZ_R 2
133 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
134 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
137 * We provide a mechanism to perform the field multiplication operation on a
138 * 64-bit value all at once rather than a byte at a time. This works by
139 * creating a mask from the top bit in each byte and using that to
140 * conditionally apply the XOR of 0x1d.
142 #define VDEV_RAIDZ_64MUL_2(x, mask) \
144 (mask) = (x) & 0x8080808080808080ULL; \
145 (mask) = ((mask) << 1) - ((mask) >> 7); \
146 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
147 ((mask) & 0x1d1d1d1d1d1d1d1d); \
150 #define VDEV_RAIDZ_64MUL_4(x, mask) \
152 VDEV_RAIDZ_64MUL_2((x), mask); \
153 VDEV_RAIDZ_64MUL_2((x), mask); \
157 * Force reconstruction to use the general purpose method.
159 int vdev_raidz_default_to_general
;
162 * These two tables represent powers and logs of 2 in the Galois field defined
163 * above. These values were computed by repeatedly multiplying by 2 as above.
165 static const uint8_t vdev_raidz_pow2
[256] = {
166 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
167 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
168 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
169 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
170 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
171 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
172 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
173 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
174 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
175 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
176 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
177 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
178 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
179 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
180 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
181 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
182 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
183 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
184 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
185 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
186 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
187 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
188 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
189 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
190 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
191 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
192 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
193 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
194 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
195 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
196 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
197 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
199 static const uint8_t vdev_raidz_log2
[256] = {
200 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
201 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
202 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
203 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
204 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
205 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
206 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
207 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
208 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
209 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
210 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
211 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
212 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
213 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
214 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
215 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
216 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
217 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
218 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
219 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
220 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
221 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
222 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
223 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
224 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
225 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
226 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
227 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
228 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
229 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
230 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
231 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
234 static void vdev_raidz_generate_parity(raidz_map_t
*rm
);
237 * Multiply a given number by 2 raised to the given power.
240 vdev_raidz_exp2(uint_t a
, int exp
)
246 ASSERT(vdev_raidz_log2
[a
] > 0 || a
== 1);
248 exp
+= vdev_raidz_log2
[a
];
252 return (vdev_raidz_pow2
[exp
]);
256 vdev_raidz_map_free(raidz_map_t
*rm
)
261 for (c
= 0; c
< rm
->rm_firstdatacol
; c
++) {
262 zio_buf_free(rm
->rm_col
[c
].rc_data
, rm
->rm_col
[c
].rc_size
);
264 if (rm
->rm_col
[c
].rc_gdata
!= NULL
)
265 zio_buf_free(rm
->rm_col
[c
].rc_gdata
,
266 rm
->rm_col
[c
].rc_size
);
270 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++)
271 size
+= rm
->rm_col
[c
].rc_size
;
273 if (rm
->rm_datacopy
!= NULL
)
274 zio_buf_free(rm
->rm_datacopy
, size
);
276 kmem_free(rm
, offsetof(raidz_map_t
, rm_col
[rm
->rm_scols
]));
280 vdev_raidz_map_free_vsd(zio_t
*zio
)
282 raidz_map_t
*rm
= zio
->io_vsd
;
284 ASSERT3U(rm
->rm_freed
, ==, 0);
287 if (rm
->rm_reports
== 0)
288 vdev_raidz_map_free(rm
);
293 vdev_raidz_cksum_free(void *arg
, size_t ignored
)
295 raidz_map_t
*rm
= arg
;
297 ASSERT3U(rm
->rm_reports
, >, 0);
299 if (--rm
->rm_reports
== 0 && rm
->rm_freed
!= 0)
300 vdev_raidz_map_free(rm
);
304 vdev_raidz_cksum_finish(zio_cksum_report_t
*zcr
, const void *good_data
)
306 raidz_map_t
*rm
= zcr
->zcr_cbdata
;
307 size_t c
= zcr
->zcr_cbinfo
;
310 const char *good
= NULL
;
311 const char *bad
= rm
->rm_col
[c
].rc_data
;
313 if (good_data
== NULL
) {
314 zfs_ereport_finish_checksum(zcr
, NULL
, NULL
, B_FALSE
);
318 if (c
< rm
->rm_firstdatacol
) {
320 * The first time through, calculate the parity blocks for
321 * the good data (this relies on the fact that the good
322 * data never changes for a given logical ZIO)
324 if (rm
->rm_col
[0].rc_gdata
== NULL
) {
325 char *bad_parity
[VDEV_RAIDZ_MAXPARITY
];
329 * Set up the rm_col[]s to generate the parity for
330 * good_data, first saving the parity bufs and
331 * replacing them with buffers to hold the result.
333 for (x
= 0; x
< rm
->rm_firstdatacol
; x
++) {
334 bad_parity
[x
] = rm
->rm_col
[x
].rc_data
;
335 rm
->rm_col
[x
].rc_data
= rm
->rm_col
[x
].rc_gdata
=
336 zio_buf_alloc(rm
->rm_col
[x
].rc_size
);
339 /* fill in the data columns from good_data */
340 buf
= (char *)good_data
;
341 for (; x
< rm
->rm_cols
; x
++) {
342 rm
->rm_col
[x
].rc_data
= buf
;
343 buf
+= rm
->rm_col
[x
].rc_size
;
347 * Construct the parity from the good data.
349 vdev_raidz_generate_parity(rm
);
351 /* restore everything back to its original state */
352 for (x
= 0; x
< rm
->rm_firstdatacol
; x
++)
353 rm
->rm_col
[x
].rc_data
= bad_parity
[x
];
355 buf
= rm
->rm_datacopy
;
356 for (x
= rm
->rm_firstdatacol
; x
< rm
->rm_cols
; x
++) {
357 rm
->rm_col
[x
].rc_data
= buf
;
358 buf
+= rm
->rm_col
[x
].rc_size
;
362 ASSERT3P(rm
->rm_col
[c
].rc_gdata
, !=, NULL
);
363 good
= rm
->rm_col
[c
].rc_gdata
;
365 /* adjust good_data to point at the start of our column */
368 for (x
= rm
->rm_firstdatacol
; x
< c
; x
++)
369 good
+= rm
->rm_col
[x
].rc_size
;
372 /* we drop the ereport if it ends up that the data was good */
373 zfs_ereport_finish_checksum(zcr
, good
, bad
, B_TRUE
);
377 * Invoked indirectly by zfs_ereport_start_checksum(), called
378 * below when our read operation fails completely. The main point
379 * is to keep a copy of everything we read from disk, so that at
380 * vdev_raidz_cksum_finish() time we can compare it with the good data.
383 vdev_raidz_cksum_report(zio_t
*zio
, zio_cksum_report_t
*zcr
, void *arg
)
385 size_t c
= (size_t)(uintptr_t)arg
;
388 raidz_map_t
*rm
= zio
->io_vsd
;
391 /* set up the report and bump the refcount */
392 zcr
->zcr_cbdata
= rm
;
394 zcr
->zcr_finish
= vdev_raidz_cksum_finish
;
395 zcr
->zcr_free
= vdev_raidz_cksum_free
;
398 ASSERT3U(rm
->rm_reports
, >, 0);
400 if (rm
->rm_datacopy
!= NULL
)
404 * It's the first time we're called for this raidz_map_t, so we need
405 * to copy the data aside; there's no guarantee that our zio's buffer
406 * won't be re-used for something else.
408 * Our parity data is already in separate buffers, so there's no need
413 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++)
414 size
+= rm
->rm_col
[c
].rc_size
;
416 buf
= rm
->rm_datacopy
= zio_buf_alloc(size
);
418 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
419 raidz_col_t
*col
= &rm
->rm_col
[c
];
421 bcopy(col
->rc_data
, buf
, col
->rc_size
);
426 ASSERT3P(buf
- (caddr_t
)rm
->rm_datacopy
, ==, size
);
429 static const zio_vsd_ops_t vdev_raidz_vsd_ops
= {
430 vdev_raidz_map_free_vsd
,
431 vdev_raidz_cksum_report
435 vdev_raidz_map_alloc(zio_t
*zio
, uint64_t unit_shift
, uint64_t dcols
,
439 uint64_t b
= zio
->io_offset
>> unit_shift
;
440 uint64_t s
= zio
->io_size
>> unit_shift
;
441 uint64_t f
= b
% dcols
;
442 uint64_t o
= (b
/ dcols
) << unit_shift
;
443 uint64_t q
, r
, c
, bc
, col
, acols
, scols
, coff
, devidx
, asize
, tot
;
445 q
= s
/ (dcols
- nparity
);
446 r
= s
- q
* (dcols
- nparity
);
447 bc
= (r
== 0 ? 0 : r
+ nparity
);
448 tot
= s
+ nparity
* (q
+ (r
== 0 ? 0 : 1));
452 scols
= MIN(dcols
, roundup(bc
, nparity
+ 1));
458 ASSERT3U(acols
, <=, scols
);
460 rm
= kmem_alloc(offsetof(raidz_map_t
, rm_col
[scols
]), KM_SLEEP
);
463 rm
->rm_scols
= scols
;
465 rm
->rm_skipstart
= bc
;
466 rm
->rm_missingdata
= 0;
467 rm
->rm_missingparity
= 0;
468 rm
->rm_firstdatacol
= nparity
;
469 rm
->rm_datacopy
= NULL
;
472 rm
->rm_ecksuminjected
= 0;
476 for (c
= 0; c
< scols
; c
++) {
481 coff
+= 1ULL << unit_shift
;
483 rm
->rm_col
[c
].rc_devidx
= col
;
484 rm
->rm_col
[c
].rc_offset
= coff
;
485 rm
->rm_col
[c
].rc_data
= NULL
;
486 rm
->rm_col
[c
].rc_gdata
= NULL
;
487 rm
->rm_col
[c
].rc_error
= 0;
488 rm
->rm_col
[c
].rc_tried
= 0;
489 rm
->rm_col
[c
].rc_skipped
= 0;
492 rm
->rm_col
[c
].rc_size
= 0;
494 rm
->rm_col
[c
].rc_size
= (q
+ 1) << unit_shift
;
496 rm
->rm_col
[c
].rc_size
= q
<< unit_shift
;
498 asize
+= rm
->rm_col
[c
].rc_size
;
501 ASSERT3U(asize
, ==, tot
<< unit_shift
);
502 rm
->rm_asize
= roundup(asize
, (nparity
+ 1) << unit_shift
);
503 rm
->rm_nskip
= roundup(tot
, nparity
+ 1) - tot
;
504 ASSERT3U(rm
->rm_asize
- asize
, ==, rm
->rm_nskip
<< unit_shift
);
505 ASSERT3U(rm
->rm_nskip
, <=, nparity
);
507 for (c
= 0; c
< rm
->rm_firstdatacol
; c
++)
508 rm
->rm_col
[c
].rc_data
= zio_buf_alloc(rm
->rm_col
[c
].rc_size
);
510 rm
->rm_col
[c
].rc_data
= zio
->io_data
;
512 for (c
= c
+ 1; c
< acols
; c
++)
513 rm
->rm_col
[c
].rc_data
= (char *)rm
->rm_col
[c
- 1].rc_data
+
514 rm
->rm_col
[c
- 1].rc_size
;
517 * If all data stored spans all columns, there's a danger that parity
518 * will always be on the same device and, since parity isn't read
519 * during normal operation, that that device's I/O bandwidth won't be
520 * used effectively. We therefore switch the parity every 1MB.
522 * ... at least that was, ostensibly, the theory. As a practical
523 * matter unless we juggle the parity between all devices evenly, we
524 * won't see any benefit. Further, occasional writes that aren't a
525 * multiple of the LCM of the number of children and the minimum
526 * stripe width are sufficient to avoid pessimal behavior.
527 * Unfortunately, this decision created an implicit on-disk format
528 * requirement that we need to support for all eternity, but only
529 * for single-parity RAID-Z.
531 * If we intend to skip a sector in the zeroth column for padding
532 * we must make sure to note this swap. We will never intend to
533 * skip the first column since at least one data and one parity
534 * column must appear in each row.
536 ASSERT(rm
->rm_cols
>= 2);
537 ASSERT(rm
->rm_col
[0].rc_size
== rm
->rm_col
[1].rc_size
);
539 if (rm
->rm_firstdatacol
== 1 && (zio
->io_offset
& (1ULL << 20))) {
540 devidx
= rm
->rm_col
[0].rc_devidx
;
541 o
= rm
->rm_col
[0].rc_offset
;
542 rm
->rm_col
[0].rc_devidx
= rm
->rm_col
[1].rc_devidx
;
543 rm
->rm_col
[0].rc_offset
= rm
->rm_col
[1].rc_offset
;
544 rm
->rm_col
[1].rc_devidx
= devidx
;
545 rm
->rm_col
[1].rc_offset
= o
;
547 if (rm
->rm_skipstart
== 0)
548 rm
->rm_skipstart
= 1;
552 zio
->io_vsd_ops
= &vdev_raidz_vsd_ops
;
557 vdev_raidz_generate_parity_p(raidz_map_t
*rm
)
559 uint64_t *p
, *src
, pcount
, ccount
, i
;
562 pcount
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
/ sizeof (src
[0]);
564 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
565 src
= rm
->rm_col
[c
].rc_data
;
566 p
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
;
567 ccount
= rm
->rm_col
[c
].rc_size
/ sizeof (src
[0]);
569 if (c
== rm
->rm_firstdatacol
) {
570 ASSERT(ccount
== pcount
);
571 for (i
= 0; i
< ccount
; i
++, src
++, p
++) {
575 ASSERT(ccount
<= pcount
);
576 for (i
= 0; i
< ccount
; i
++, src
++, p
++) {
584 vdev_raidz_generate_parity_pq(raidz_map_t
*rm
)
586 uint64_t *p
, *q
, *src
, pcnt
, ccnt
, mask
, i
;
589 pcnt
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
/ sizeof (src
[0]);
590 ASSERT(rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
==
591 rm
->rm_col
[VDEV_RAIDZ_Q
].rc_size
);
593 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
594 src
= rm
->rm_col
[c
].rc_data
;
595 p
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
;
596 q
= rm
->rm_col
[VDEV_RAIDZ_Q
].rc_data
;
598 ccnt
= rm
->rm_col
[c
].rc_size
/ sizeof (src
[0]);
600 if (c
== rm
->rm_firstdatacol
) {
601 ASSERT(ccnt
== pcnt
|| ccnt
== 0);
602 for (i
= 0; i
< ccnt
; i
++, src
++, p
++, q
++) {
606 for (; i
< pcnt
; i
++, src
++, p
++, q
++) {
611 ASSERT(ccnt
<= pcnt
);
614 * Apply the algorithm described above by multiplying
615 * the previous result and adding in the new value.
617 for (i
= 0; i
< ccnt
; i
++, src
++, p
++, q
++) {
620 VDEV_RAIDZ_64MUL_2(*q
, mask
);
625 * Treat short columns as though they are full of 0s.
626 * Note that there's therefore nothing needed for P.
628 for (; i
< pcnt
; i
++, q
++) {
629 VDEV_RAIDZ_64MUL_2(*q
, mask
);
636 vdev_raidz_generate_parity_pqr(raidz_map_t
*rm
)
638 uint64_t *p
, *q
, *r
, *src
, pcnt
, ccnt
, mask
, i
;
641 pcnt
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
/ sizeof (src
[0]);
642 ASSERT(rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
==
643 rm
->rm_col
[VDEV_RAIDZ_Q
].rc_size
);
644 ASSERT(rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
==
645 rm
->rm_col
[VDEV_RAIDZ_R
].rc_size
);
647 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
648 src
= rm
->rm_col
[c
].rc_data
;
649 p
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
;
650 q
= rm
->rm_col
[VDEV_RAIDZ_Q
].rc_data
;
651 r
= rm
->rm_col
[VDEV_RAIDZ_R
].rc_data
;
653 ccnt
= rm
->rm_col
[c
].rc_size
/ sizeof (src
[0]);
655 if (c
== rm
->rm_firstdatacol
) {
656 ASSERT(ccnt
== pcnt
|| ccnt
== 0);
657 for (i
= 0; i
< ccnt
; i
++, src
++, p
++, q
++, r
++) {
662 for (; i
< pcnt
; i
++, src
++, p
++, q
++, r
++) {
668 ASSERT(ccnt
<= pcnt
);
671 * Apply the algorithm described above by multiplying
672 * the previous result and adding in the new value.
674 for (i
= 0; i
< ccnt
; i
++, src
++, p
++, q
++, r
++) {
677 VDEV_RAIDZ_64MUL_2(*q
, mask
);
680 VDEV_RAIDZ_64MUL_4(*r
, mask
);
685 * Treat short columns as though they are full of 0s.
686 * Note that there's therefore nothing needed for P.
688 for (; i
< pcnt
; i
++, q
++, r
++) {
689 VDEV_RAIDZ_64MUL_2(*q
, mask
);
690 VDEV_RAIDZ_64MUL_4(*r
, mask
);
697 * Generate RAID parity in the first virtual columns according to the number of
698 * parity columns available.
701 vdev_raidz_generate_parity(raidz_map_t
*rm
)
703 switch (rm
->rm_firstdatacol
) {
705 vdev_raidz_generate_parity_p(rm
);
708 vdev_raidz_generate_parity_pq(rm
);
711 vdev_raidz_generate_parity_pqr(rm
);
714 cmn_err(CE_PANIC
, "invalid RAID-Z configuration");
719 vdev_raidz_reconstruct_p(raidz_map_t
*rm
, int *tgts
, int ntgts
)
721 uint64_t *dst
, *src
, xcount
, ccount
, count
, i
;
726 ASSERT(x
>= rm
->rm_firstdatacol
);
727 ASSERT(x
< rm
->rm_cols
);
729 xcount
= rm
->rm_col
[x
].rc_size
/ sizeof (src
[0]);
730 ASSERT(xcount
<= rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
/ sizeof (src
[0]));
733 src
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
;
734 dst
= rm
->rm_col
[x
].rc_data
;
735 for (i
= 0; i
< xcount
; i
++, dst
++, src
++) {
739 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
740 src
= rm
->rm_col
[c
].rc_data
;
741 dst
= rm
->rm_col
[x
].rc_data
;
746 ccount
= rm
->rm_col
[c
].rc_size
/ sizeof (src
[0]);
747 count
= MIN(ccount
, xcount
);
749 for (i
= 0; i
< count
; i
++, dst
++, src
++) {
754 return (1 << VDEV_RAIDZ_P
);
758 vdev_raidz_reconstruct_q(raidz_map_t
*rm
, int *tgts
, int ntgts
)
760 uint64_t *dst
, *src
, xcount
, ccount
, count
, mask
, i
;
767 xcount
= rm
->rm_col
[x
].rc_size
/ sizeof (src
[0]);
768 ASSERT(xcount
<= rm
->rm_col
[VDEV_RAIDZ_Q
].rc_size
/ sizeof (src
[0]));
770 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
771 src
= rm
->rm_col
[c
].rc_data
;
772 dst
= rm
->rm_col
[x
].rc_data
;
777 ccount
= rm
->rm_col
[c
].rc_size
/ sizeof (src
[0]);
779 count
= MIN(ccount
, xcount
);
781 if (c
== rm
->rm_firstdatacol
) {
782 for (i
= 0; i
< count
; i
++, dst
++, src
++) {
785 for (; i
< xcount
; i
++, dst
++) {
790 for (i
= 0; i
< count
; i
++, dst
++, src
++) {
791 VDEV_RAIDZ_64MUL_2(*dst
, mask
);
795 for (; i
< xcount
; i
++, dst
++) {
796 VDEV_RAIDZ_64MUL_2(*dst
, mask
);
801 src
= rm
->rm_col
[VDEV_RAIDZ_Q
].rc_data
;
802 dst
= rm
->rm_col
[x
].rc_data
;
803 exp
= 255 - (rm
->rm_cols
- 1 - x
);
805 for (i
= 0; i
< xcount
; i
++, dst
++, src
++) {
807 for (j
= 0, b
= (uint8_t *)dst
; j
< 8; j
++, b
++) {
808 *b
= vdev_raidz_exp2(*b
, exp
);
812 return (1 << VDEV_RAIDZ_Q
);
816 vdev_raidz_reconstruct_pq(raidz_map_t
*rm
, int *tgts
, int ntgts
)
818 uint8_t *p
, *q
, *pxy
, *qxy
, *xd
, *yd
, tmp
, a
, b
, aexp
, bexp
;
820 uint64_t xsize
, ysize
, i
;
826 ASSERT(x
>= rm
->rm_firstdatacol
);
827 ASSERT(y
< rm
->rm_cols
);
829 ASSERT(rm
->rm_col
[x
].rc_size
>= rm
->rm_col
[y
].rc_size
);
832 * Move the parity data aside -- we're going to compute parity as
833 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
834 * reuse the parity generation mechanism without trashing the actual
835 * parity so we make those columns appear to be full of zeros by
836 * setting their lengths to zero.
838 pdata
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
;
839 qdata
= rm
->rm_col
[VDEV_RAIDZ_Q
].rc_data
;
840 xsize
= rm
->rm_col
[x
].rc_size
;
841 ysize
= rm
->rm_col
[y
].rc_size
;
843 rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
=
844 zio_buf_alloc(rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
);
845 rm
->rm_col
[VDEV_RAIDZ_Q
].rc_data
=
846 zio_buf_alloc(rm
->rm_col
[VDEV_RAIDZ_Q
].rc_size
);
847 rm
->rm_col
[x
].rc_size
= 0;
848 rm
->rm_col
[y
].rc_size
= 0;
850 vdev_raidz_generate_parity_pq(rm
);
852 rm
->rm_col
[x
].rc_size
= xsize
;
853 rm
->rm_col
[y
].rc_size
= ysize
;
857 pxy
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
;
858 qxy
= rm
->rm_col
[VDEV_RAIDZ_Q
].rc_data
;
859 xd
= rm
->rm_col
[x
].rc_data
;
860 yd
= rm
->rm_col
[y
].rc_data
;
864 * Pxy = P + D_x + D_y
865 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
867 * We can then solve for D_x:
868 * D_x = A * (P + Pxy) + B * (Q + Qxy)
870 * A = 2^(x - y) * (2^(x - y) + 1)^-1
871 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
873 * With D_x in hand, we can easily solve for D_y:
874 * D_y = P + Pxy + D_x
877 a
= vdev_raidz_pow2
[255 + x
- y
];
878 b
= vdev_raidz_pow2
[255 - (rm
->rm_cols
- 1 - x
)];
879 tmp
= 255 - vdev_raidz_log2
[a
^ 1];
881 aexp
= vdev_raidz_log2
[vdev_raidz_exp2(a
, tmp
)];
882 bexp
= vdev_raidz_log2
[vdev_raidz_exp2(b
, tmp
)];
884 for (i
= 0; i
< xsize
; i
++, p
++, q
++, pxy
++, qxy
++, xd
++, yd
++) {
885 *xd
= vdev_raidz_exp2(*p
^ *pxy
, aexp
) ^
886 vdev_raidz_exp2(*q
^ *qxy
, bexp
);
889 *yd
= *p
^ *pxy
^ *xd
;
892 zio_buf_free(rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
,
893 rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
);
894 zio_buf_free(rm
->rm_col
[VDEV_RAIDZ_Q
].rc_data
,
895 rm
->rm_col
[VDEV_RAIDZ_Q
].rc_size
);
898 * Restore the saved parity data.
900 rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
= pdata
;
901 rm
->rm_col
[VDEV_RAIDZ_Q
].rc_data
= qdata
;
903 return ((1 << VDEV_RAIDZ_P
) | (1 << VDEV_RAIDZ_Q
));
908 * In the general case of reconstruction, we must solve the system of linear
909 * equations defined by the coeffecients used to generate parity as well as
910 * the contents of the data and parity disks. This can be expressed with
911 * vectors for the original data (D) and the actual data (d) and parity (p)
912 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
916 * | V | | D_0 | | p_m-1 |
917 * | | x | : | = | d_0 |
918 * | I | | D_n-1 | | : |
919 * | | ~~ ~~ | d_n-1 |
922 * I is simply a square identity matrix of size n, and V is a vandermonde
923 * matrix defined by the coeffecients we chose for the various parity columns
924 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
925 * computation as well as linear separability.
928 * | 1 .. 1 1 1 | | p_0 |
929 * | 2^n-1 .. 4 2 1 | __ __ | : |
930 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
931 * | 1 .. 0 0 0 | | D_1 | | d_0 |
932 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
933 * | : : : : | | : | | d_2 |
934 * | 0 .. 1 0 0 | | D_n-1 | | : |
935 * | 0 .. 0 1 0 | ~~ ~~ | : |
936 * | 0 .. 0 0 1 | | d_n-1 |
939 * Note that I, V, d, and p are known. To compute D, we must invert the
940 * matrix and use the known data and parity values to reconstruct the unknown
941 * data values. We begin by removing the rows in V|I and d|p that correspond
942 * to failed or missing columns; we then make V|I square (n x n) and d|p
943 * sized n by removing rows corresponding to unused parity from the bottom up
944 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
945 * using Gauss-Jordan elimination. In the example below we use m=3 parity
946 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
948 * | 1 1 1 1 1 1 1 1 |
949 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
950 * | 19 205 116 29 64 16 4 1 | / /
951 * | 1 0 0 0 0 0 0 0 | / /
952 * | 0 1 0 0 0 0 0 0 | <--' /
953 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
954 * | 0 0 0 1 0 0 0 0 |
955 * | 0 0 0 0 1 0 0 0 |
956 * | 0 0 0 0 0 1 0 0 |
957 * | 0 0 0 0 0 0 1 0 |
958 * | 0 0 0 0 0 0 0 1 |
961 * | 1 1 1 1 1 1 1 1 |
962 * | 128 64 32 16 8 4 2 1 |
963 * | 19 205 116 29 64 16 4 1 |
964 * | 1 0 0 0 0 0 0 0 |
965 * | 0 1 0 0 0 0 0 0 |
966 * (V|I)' = | 0 0 1 0 0 0 0 0 |
967 * | 0 0 0 1 0 0 0 0 |
968 * | 0 0 0 0 1 0 0 0 |
969 * | 0 0 0 0 0 1 0 0 |
970 * | 0 0 0 0 0 0 1 0 |
971 * | 0 0 0 0 0 0 0 1 |
974 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
975 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
976 * matrix is not singular.
978 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
979 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
980 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
981 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
982 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
983 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
984 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
985 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
988 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
989 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
990 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
991 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
992 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
993 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
994 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
995 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
998 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
999 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1000 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1001 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1002 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1003 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1004 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1005 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1008 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1009 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1010 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1011 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1012 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1013 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1014 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1015 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1018 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1019 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1020 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1021 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1022 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1023 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1024 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1025 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1028 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1029 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1030 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1031 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1032 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1033 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1034 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1035 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1038 * | 0 0 1 0 0 0 0 0 |
1039 * | 167 100 5 41 159 169 217 208 |
1040 * | 166 100 4 40 158 168 216 209 |
1041 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1042 * | 0 0 0 0 1 0 0 0 |
1043 * | 0 0 0 0 0 1 0 0 |
1044 * | 0 0 0 0 0 0 1 0 |
1045 * | 0 0 0 0 0 0 0 1 |
1048 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1049 * of the missing data.
1051 * As is apparent from the example above, the only non-trivial rows in the
1052 * inverse matrix correspond to the data disks that we're trying to
1053 * reconstruct. Indeed, those are the only rows we need as the others would
1054 * only be useful for reconstructing data known or assumed to be valid. For
1055 * that reason, we only build the coefficients in the rows that correspond to
1061 vdev_raidz_matrix_init(raidz_map_t
*rm
, int n
, int nmap
, int *map
,
1067 ASSERT(n
== rm
->rm_cols
- rm
->rm_firstdatacol
);
1070 * Fill in the missing rows of interest.
1072 for (i
= 0; i
< nmap
; i
++) {
1073 ASSERT3S(0, <=, map
[i
]);
1074 ASSERT3S(map
[i
], <=, 2);
1081 for (j
= 0; j
< n
; j
++) {
1085 rows
[i
][j
] = vdev_raidz_pow2
[pow
];
1091 vdev_raidz_matrix_invert(raidz_map_t
*rm
, int n
, int nmissing
, int *missing
,
1092 uint8_t **rows
, uint8_t **invrows
, const uint8_t *used
)
1098 * Assert that the first nmissing entries from the array of used
1099 * columns correspond to parity columns and that subsequent entries
1100 * correspond to data columns.
1102 for (i
= 0; i
< nmissing
; i
++) {
1103 ASSERT3S(used
[i
], <, rm
->rm_firstdatacol
);
1105 for (; i
< n
; i
++) {
1106 ASSERT3S(used
[i
], >=, rm
->rm_firstdatacol
);
1110 * First initialize the storage where we'll compute the inverse rows.
1112 for (i
= 0; i
< nmissing
; i
++) {
1113 for (j
= 0; j
< n
; j
++) {
1114 invrows
[i
][j
] = (i
== j
) ? 1 : 0;
1119 * Subtract all trivial rows from the rows of consequence.
1121 for (i
= 0; i
< nmissing
; i
++) {
1122 for (j
= nmissing
; j
< n
; j
++) {
1123 ASSERT3U(used
[j
], >=, rm
->rm_firstdatacol
);
1124 jj
= used
[j
] - rm
->rm_firstdatacol
;
1126 invrows
[i
][j
] = rows
[i
][jj
];
1132 * For each of the rows of interest, we must normalize it and subtract
1133 * a multiple of it from the other rows.
1135 for (i
= 0; i
< nmissing
; i
++) {
1136 for (j
= 0; j
< missing
[i
]; j
++) {
1137 ASSERT3U(rows
[i
][j
], ==, 0);
1139 ASSERT3U(rows
[i
][missing
[i
]], !=, 0);
1142 * Compute the inverse of the first element and multiply each
1143 * element in the row by that value.
1145 log
= 255 - vdev_raidz_log2
[rows
[i
][missing
[i
]]];
1147 for (j
= 0; j
< n
; j
++) {
1148 rows
[i
][j
] = vdev_raidz_exp2(rows
[i
][j
], log
);
1149 invrows
[i
][j
] = vdev_raidz_exp2(invrows
[i
][j
], log
);
1152 for (ii
= 0; ii
< nmissing
; ii
++) {
1156 ASSERT3U(rows
[ii
][missing
[i
]], !=, 0);
1158 log
= vdev_raidz_log2
[rows
[ii
][missing
[i
]]];
1160 for (j
= 0; j
< n
; j
++) {
1162 vdev_raidz_exp2(rows
[i
][j
], log
);
1164 vdev_raidz_exp2(invrows
[i
][j
], log
);
1170 * Verify that the data that is left in the rows are properly part of
1171 * an identity matrix.
1173 for (i
= 0; i
< nmissing
; i
++) {
1174 for (j
= 0; j
< n
; j
++) {
1175 if (j
== missing
[i
]) {
1176 ASSERT3U(rows
[i
][j
], ==, 1);
1178 ASSERT3U(rows
[i
][j
], ==, 0);
1185 vdev_raidz_matrix_reconstruct(raidz_map_t
*rm
, int n
, int nmissing
,
1186 int *missing
, uint8_t **invrows
, const uint8_t *used
)
1191 uint8_t *dst
[VDEV_RAIDZ_MAXPARITY
];
1192 uint64_t dcount
[VDEV_RAIDZ_MAXPARITY
];
1195 uint8_t *invlog
[VDEV_RAIDZ_MAXPARITY
];
1199 psize
= sizeof (invlog
[0][0]) * n
* nmissing
;
1200 p
= kmem_alloc(psize
, KM_SLEEP
);
1202 for (pp
= p
, i
= 0; i
< nmissing
; i
++) {
1207 for (i
= 0; i
< nmissing
; i
++) {
1208 for (j
= 0; j
< n
; j
++) {
1209 ASSERT3U(invrows
[i
][j
], !=, 0);
1210 invlog
[i
][j
] = vdev_raidz_log2
[invrows
[i
][j
]];
1214 for (i
= 0; i
< n
; i
++) {
1216 ASSERT3U(c
, <, rm
->rm_cols
);
1218 src
= rm
->rm_col
[c
].rc_data
;
1219 ccount
= rm
->rm_col
[c
].rc_size
;
1220 for (j
= 0; j
< nmissing
; j
++) {
1221 cc
= missing
[j
] + rm
->rm_firstdatacol
;
1222 ASSERT3U(cc
, >=, rm
->rm_firstdatacol
);
1223 ASSERT3U(cc
, <, rm
->rm_cols
);
1224 ASSERT3U(cc
, !=, c
);
1226 dst
[j
] = rm
->rm_col
[cc
].rc_data
;
1227 dcount
[j
] = rm
->rm_col
[cc
].rc_size
;
1230 ASSERT(ccount
>= rm
->rm_col
[missing
[0]].rc_size
|| i
> 0);
1232 for (x
= 0; x
< ccount
; x
++, src
++) {
1234 log
= vdev_raidz_log2
[*src
];
1236 for (cc
= 0; cc
< nmissing
; cc
++) {
1237 if (x
>= dcount
[cc
])
1243 if ((ll
= log
+ invlog
[cc
][i
]) >= 255)
1245 val
= vdev_raidz_pow2
[ll
];
1256 kmem_free(p
, psize
);
1260 vdev_raidz_reconstruct_general(raidz_map_t
*rm
, int *tgts
, int ntgts
)
1264 int missing_rows
[VDEV_RAIDZ_MAXPARITY
];
1265 int parity_map
[VDEV_RAIDZ_MAXPARITY
];
1270 uint8_t *rows
[VDEV_RAIDZ_MAXPARITY
];
1271 uint8_t *invrows
[VDEV_RAIDZ_MAXPARITY
];
1277 n
= rm
->rm_cols
- rm
->rm_firstdatacol
;
1280 * Figure out which data columns are missing.
1283 for (t
= 0; t
< ntgts
; t
++) {
1284 if (tgts
[t
] >= rm
->rm_firstdatacol
) {
1285 missing_rows
[nmissing_rows
++] =
1286 tgts
[t
] - rm
->rm_firstdatacol
;
1291 * Figure out which parity columns to use to help generate the missing
1294 for (tt
= 0, c
= 0, i
= 0; i
< nmissing_rows
; c
++) {
1296 ASSERT(c
< rm
->rm_firstdatacol
);
1299 * Skip any targeted parity columns.
1301 if (c
== tgts
[tt
]) {
1313 ASSERT3U(code
, <, 1 << VDEV_RAIDZ_MAXPARITY
);
1315 psize
= (sizeof (rows
[0][0]) + sizeof (invrows
[0][0])) *
1316 nmissing_rows
* n
+ sizeof (used
[0]) * n
;
1317 p
= kmem_alloc(psize
, KM_SLEEP
);
1319 for (pp
= p
, i
= 0; i
< nmissing_rows
; i
++) {
1327 for (i
= 0; i
< nmissing_rows
; i
++) {
1328 used
[i
] = parity_map
[i
];
1331 for (tt
= 0, c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
1332 if (tt
< nmissing_rows
&&
1333 c
== missing_rows
[tt
] + rm
->rm_firstdatacol
) {
1344 * Initialize the interesting rows of the matrix.
1346 vdev_raidz_matrix_init(rm
, n
, nmissing_rows
, parity_map
, rows
);
1349 * Invert the matrix.
1351 vdev_raidz_matrix_invert(rm
, n
, nmissing_rows
, missing_rows
, rows
,
1355 * Reconstruct the missing data using the generated matrix.
1357 vdev_raidz_matrix_reconstruct(rm
, n
, nmissing_rows
, missing_rows
,
1360 kmem_free(p
, psize
);
1366 vdev_raidz_reconstruct(raidz_map_t
*rm
, int *t
, int nt
)
1368 int tgts
[VDEV_RAIDZ_MAXPARITY
], *dt
;
1372 int nbadparity
, nbaddata
;
1373 int parity_valid
[VDEV_RAIDZ_MAXPARITY
];
1376 * The tgts list must already be sorted.
1378 for (i
= 1; i
< nt
; i
++) {
1379 ASSERT(t
[i
] > t
[i
- 1]);
1382 nbadparity
= rm
->rm_firstdatacol
;
1383 nbaddata
= rm
->rm_cols
- nbadparity
;
1385 for (i
= 0, c
= 0; c
< rm
->rm_cols
; c
++) {
1386 if (c
< rm
->rm_firstdatacol
)
1387 parity_valid
[c
] = B_FALSE
;
1389 if (i
< nt
&& c
== t
[i
]) {
1392 } else if (rm
->rm_col
[c
].rc_error
!= 0) {
1394 } else if (c
>= rm
->rm_firstdatacol
) {
1397 parity_valid
[c
] = B_TRUE
;
1402 ASSERT(ntgts
>= nt
);
1403 ASSERT(nbaddata
>= 0);
1404 ASSERT(nbaddata
+ nbadparity
== ntgts
);
1406 dt
= &tgts
[nbadparity
];
1409 * See if we can use any of our optimized reconstruction routines.
1411 if (!vdev_raidz_default_to_general
) {
1414 if (parity_valid
[VDEV_RAIDZ_P
])
1415 return (vdev_raidz_reconstruct_p(rm
, dt
, 1));
1417 ASSERT(rm
->rm_firstdatacol
> 1);
1419 if (parity_valid
[VDEV_RAIDZ_Q
])
1420 return (vdev_raidz_reconstruct_q(rm
, dt
, 1));
1422 ASSERT(rm
->rm_firstdatacol
> 2);
1426 ASSERT(rm
->rm_firstdatacol
> 1);
1428 if (parity_valid
[VDEV_RAIDZ_P
] &&
1429 parity_valid
[VDEV_RAIDZ_Q
])
1430 return (vdev_raidz_reconstruct_pq(rm
, dt
, 2));
1432 ASSERT(rm
->rm_firstdatacol
> 2);
1438 code
= vdev_raidz_reconstruct_general(rm
, tgts
, ntgts
);
1439 ASSERT(code
< (1 << VDEV_RAIDZ_MAXPARITY
));
1445 vdev_raidz_open(vdev_t
*vd
, uint64_t *asize
, uint64_t *max_asize
,
1449 uint64_t nparity
= vd
->vdev_nparity
;
1454 ASSERT(nparity
> 0);
1456 if (nparity
> VDEV_RAIDZ_MAXPARITY
||
1457 vd
->vdev_children
< nparity
+ 1) {
1458 vd
->vdev_stat
.vs_aux
= VDEV_AUX_BAD_LABEL
;
1462 vdev_open_children(vd
);
1464 for (c
= 0; c
< vd
->vdev_children
; c
++) {
1465 cvd
= vd
->vdev_child
[c
];
1467 if (cvd
->vdev_open_error
!= 0) {
1468 lasterror
= cvd
->vdev_open_error
;
1473 *asize
= MIN(*asize
- 1, cvd
->vdev_asize
- 1) + 1;
1474 *max_asize
= MIN(*max_asize
- 1, cvd
->vdev_max_asize
- 1) + 1;
1475 *ashift
= MAX(*ashift
, cvd
->vdev_ashift
);
1478 *asize
*= vd
->vdev_children
;
1479 *max_asize
*= vd
->vdev_children
;
1481 if (numerrors
> nparity
) {
1482 vd
->vdev_stat
.vs_aux
= VDEV_AUX_NO_REPLICAS
;
1490 vdev_raidz_close(vdev_t
*vd
)
1494 for (c
= 0; c
< vd
->vdev_children
; c
++)
1495 vdev_close(vd
->vdev_child
[c
]);
1499 vdev_raidz_asize(vdev_t
*vd
, uint64_t psize
)
1502 uint64_t ashift
= vd
->vdev_top
->vdev_ashift
;
1503 uint64_t cols
= vd
->vdev_children
;
1504 uint64_t nparity
= vd
->vdev_nparity
;
1506 asize
= ((psize
- 1) >> ashift
) + 1;
1507 asize
+= nparity
* ((asize
+ cols
- nparity
- 1) / (cols
- nparity
));
1508 asize
= roundup(asize
, nparity
+ 1) << ashift
;
1514 vdev_raidz_child_done(zio_t
*zio
)
1516 raidz_col_t
*rc
= zio
->io_private
;
1518 rc
->rc_error
= zio
->io_error
;
1524 vdev_raidz_io_start(zio_t
*zio
)
1526 vdev_t
*vd
= zio
->io_vd
;
1527 vdev_t
*tvd
= vd
->vdev_top
;
1533 rm
= vdev_raidz_map_alloc(zio
, tvd
->vdev_ashift
, vd
->vdev_children
,
1536 ASSERT3U(rm
->rm_asize
, ==, vdev_psize_to_asize(vd
, zio
->io_size
));
1538 if (zio
->io_type
== ZIO_TYPE_WRITE
) {
1539 vdev_raidz_generate_parity(rm
);
1541 for (c
= 0; c
< rm
->rm_cols
; c
++) {
1542 rc
= &rm
->rm_col
[c
];
1543 cvd
= vd
->vdev_child
[rc
->rc_devidx
];
1544 zio_nowait(zio_vdev_child_io(zio
, NULL
, cvd
,
1545 rc
->rc_offset
, rc
->rc_data
, rc
->rc_size
,
1546 zio
->io_type
, zio
->io_priority
, 0,
1547 vdev_raidz_child_done
, rc
));
1551 * Generate optional I/Os for any skipped sectors to improve
1552 * aggregation contiguity.
1554 for (c
= rm
->rm_skipstart
, i
= 0; i
< rm
->rm_nskip
; c
++, i
++) {
1555 ASSERT(c
<= rm
->rm_scols
);
1556 if (c
== rm
->rm_scols
)
1558 rc
= &rm
->rm_col
[c
];
1559 cvd
= vd
->vdev_child
[rc
->rc_devidx
];
1560 zio_nowait(zio_vdev_child_io(zio
, NULL
, cvd
,
1561 rc
->rc_offset
+ rc
->rc_size
, NULL
,
1562 1 << tvd
->vdev_ashift
,
1563 zio
->io_type
, zio
->io_priority
,
1564 ZIO_FLAG_NODATA
| ZIO_FLAG_OPTIONAL
, NULL
, NULL
));
1567 return (ZIO_PIPELINE_CONTINUE
);
1570 ASSERT(zio
->io_type
== ZIO_TYPE_READ
);
1573 * Iterate over the columns in reverse order so that we hit the parity
1574 * last -- any errors along the way will force us to read the parity.
1576 for (c
= rm
->rm_cols
- 1; c
>= 0; c
--) {
1577 rc
= &rm
->rm_col
[c
];
1578 cvd
= vd
->vdev_child
[rc
->rc_devidx
];
1579 if (!vdev_readable(cvd
)) {
1580 if (c
>= rm
->rm_firstdatacol
)
1581 rm
->rm_missingdata
++;
1583 rm
->rm_missingparity
++;
1584 rc
->rc_error
= ENXIO
;
1585 rc
->rc_tried
= 1; /* don't even try */
1589 if (vdev_dtl_contains(cvd
, DTL_MISSING
, zio
->io_txg
, 1)) {
1590 if (c
>= rm
->rm_firstdatacol
)
1591 rm
->rm_missingdata
++;
1593 rm
->rm_missingparity
++;
1594 rc
->rc_error
= ESTALE
;
1598 if (c
>= rm
->rm_firstdatacol
|| rm
->rm_missingdata
> 0 ||
1599 (zio
->io_flags
& (ZIO_FLAG_SCRUB
| ZIO_FLAG_RESILVER
))) {
1600 zio_nowait(zio_vdev_child_io(zio
, NULL
, cvd
,
1601 rc
->rc_offset
, rc
->rc_data
, rc
->rc_size
,
1602 zio
->io_type
, zio
->io_priority
, 0,
1603 vdev_raidz_child_done
, rc
));
1607 return (ZIO_PIPELINE_CONTINUE
);
1612 * Report a checksum error for a child of a RAID-Z device.
1615 raidz_checksum_error(zio_t
*zio
, raidz_col_t
*rc
, void *bad_data
)
1617 vdev_t
*vd
= zio
->io_vd
->vdev_child
[rc
->rc_devidx
];
1619 if (!(zio
->io_flags
& ZIO_FLAG_SPECULATIVE
)) {
1620 zio_bad_cksum_t zbc
;
1621 raidz_map_t
*rm
= zio
->io_vsd
;
1623 mutex_enter(&vd
->vdev_stat_lock
);
1624 vd
->vdev_stat
.vs_checksum_errors
++;
1625 mutex_exit(&vd
->vdev_stat_lock
);
1627 zbc
.zbc_has_cksum
= 0;
1628 zbc
.zbc_injected
= rm
->rm_ecksuminjected
;
1630 zfs_ereport_post_checksum(zio
->io_spa
, vd
, zio
,
1631 rc
->rc_offset
, rc
->rc_size
, rc
->rc_data
, bad_data
,
1637 * We keep track of whether or not there were any injected errors, so that
1638 * any ereports we generate can note it.
1641 raidz_checksum_verify(zio_t
*zio
)
1643 zio_bad_cksum_t zbc
;
1644 raidz_map_t
*rm
= zio
->io_vsd
;
1646 int ret
= zio_checksum_error(zio
, &zbc
);
1647 if (ret
!= 0 && zbc
.zbc_injected
!= 0)
1648 rm
->rm_ecksuminjected
= 1;
1654 * Generate the parity from the data columns. If we tried and were able to
1655 * read the parity without error, verify that the generated parity matches the
1656 * data we read. If it doesn't, we fire off a checksum error. Return the
1657 * number such failures.
1660 raidz_parity_verify(zio_t
*zio
, raidz_map_t
*rm
)
1662 void *orig
[VDEV_RAIDZ_MAXPARITY
];
1666 for (c
= 0; c
< rm
->rm_firstdatacol
; c
++) {
1667 rc
= &rm
->rm_col
[c
];
1668 if (!rc
->rc_tried
|| rc
->rc_error
!= 0)
1670 orig
[c
] = zio_buf_alloc(rc
->rc_size
);
1671 bcopy(rc
->rc_data
, orig
[c
], rc
->rc_size
);
1674 vdev_raidz_generate_parity(rm
);
1676 for (c
= 0; c
< rm
->rm_firstdatacol
; c
++) {
1677 rc
= &rm
->rm_col
[c
];
1678 if (!rc
->rc_tried
|| rc
->rc_error
!= 0)
1680 if (bcmp(orig
[c
], rc
->rc_data
, rc
->rc_size
) != 0) {
1681 raidz_checksum_error(zio
, rc
, orig
[c
]);
1682 rc
->rc_error
= ECKSUM
;
1685 zio_buf_free(orig
[c
], rc
->rc_size
);
1692 * Keep statistics on all the ways that we used parity to correct data.
1694 static uint64_t raidz_corrected
[1 << VDEV_RAIDZ_MAXPARITY
];
1697 vdev_raidz_worst_error(raidz_map_t
*rm
)
1701 for (int c
= 0; c
< rm
->rm_cols
; c
++)
1702 error
= zio_worst_error(error
, rm
->rm_col
[c
].rc_error
);
1708 * Iterate over all combinations of bad data and attempt a reconstruction.
1709 * Note that the algorithm below is non-optimal because it doesn't take into
1710 * account how reconstruction is actually performed. For example, with
1711 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1712 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1713 * cases we'd only use parity information in column 0.
1716 vdev_raidz_combrec(zio_t
*zio
, int total_errors
, int data_errors
)
1718 raidz_map_t
*rm
= zio
->io_vsd
;
1720 void *orig
[VDEV_RAIDZ_MAXPARITY
];
1721 int tstore
[VDEV_RAIDZ_MAXPARITY
+ 2];
1722 int *tgts
= &tstore
[1];
1723 int current
, next
, i
, c
, n
;
1726 ASSERT(total_errors
< rm
->rm_firstdatacol
);
1729 * This simplifies one edge condition.
1733 for (n
= 1; n
<= rm
->rm_firstdatacol
- total_errors
; n
++) {
1735 * Initialize the targets array by finding the first n columns
1736 * that contain no error.
1738 * If there were no data errors, we need to ensure that we're
1739 * always explicitly attempting to reconstruct at least one
1740 * data column. To do this, we simply push the highest target
1741 * up into the data columns.
1743 for (c
= 0, i
= 0; i
< n
; i
++) {
1744 if (i
== n
- 1 && data_errors
== 0 &&
1745 c
< rm
->rm_firstdatacol
) {
1746 c
= rm
->rm_firstdatacol
;
1749 while (rm
->rm_col
[c
].rc_error
!= 0) {
1751 ASSERT3S(c
, <, rm
->rm_cols
);
1758 * Setting tgts[n] simplifies the other edge condition.
1760 tgts
[n
] = rm
->rm_cols
;
1763 * These buffers were allocated in previous iterations.
1765 for (i
= 0; i
< n
- 1; i
++) {
1766 ASSERT(orig
[i
] != NULL
);
1769 orig
[n
- 1] = zio_buf_alloc(rm
->rm_col
[0].rc_size
);
1772 next
= tgts
[current
];
1774 while (current
!= n
) {
1775 tgts
[current
] = next
;
1779 * Save off the original data that we're going to
1780 * attempt to reconstruct.
1782 for (i
= 0; i
< n
; i
++) {
1783 ASSERT(orig
[i
] != NULL
);
1786 ASSERT3S(c
, <, rm
->rm_cols
);
1787 rc
= &rm
->rm_col
[c
];
1788 bcopy(rc
->rc_data
, orig
[i
], rc
->rc_size
);
1792 * Attempt a reconstruction and exit the outer loop on
1795 code
= vdev_raidz_reconstruct(rm
, tgts
, n
);
1796 if (raidz_checksum_verify(zio
) == 0) {
1797 atomic_inc_64(&raidz_corrected
[code
]);
1799 for (i
= 0; i
< n
; i
++) {
1801 rc
= &rm
->rm_col
[c
];
1802 ASSERT(rc
->rc_error
== 0);
1804 raidz_checksum_error(zio
, rc
,
1806 rc
->rc_error
= ECKSUM
;
1814 * Restore the original data.
1816 for (i
= 0; i
< n
; i
++) {
1818 rc
= &rm
->rm_col
[c
];
1819 bcopy(orig
[i
], rc
->rc_data
, rc
->rc_size
);
1824 * Find the next valid column after the current
1827 for (next
= tgts
[current
] + 1;
1828 next
< rm
->rm_cols
&&
1829 rm
->rm_col
[next
].rc_error
!= 0; next
++)
1832 ASSERT(next
<= tgts
[current
+ 1]);
1835 * If that spot is available, we're done here.
1837 if (next
!= tgts
[current
+ 1])
1841 * Otherwise, find the next valid column after
1842 * the previous position.
1844 for (c
= tgts
[current
- 1] + 1;
1845 rm
->rm_col
[c
].rc_error
!= 0; c
++)
1851 } while (current
!= n
);
1856 for (i
= 0; i
< n
; i
++) {
1857 zio_buf_free(orig
[i
], rm
->rm_col
[0].rc_size
);
1864 vdev_raidz_io_done(zio_t
*zio
)
1866 vdev_t
*vd
= zio
->io_vd
;
1868 raidz_map_t
*rm
= zio
->io_vsd
;
1870 int unexpected_errors
= 0;
1871 int parity_errors
= 0;
1872 int parity_untried
= 0;
1873 int data_errors
= 0;
1874 int total_errors
= 0;
1876 int tgts
[VDEV_RAIDZ_MAXPARITY
];
1879 ASSERT(zio
->io_bp
!= NULL
); /* XXX need to add code to enforce this */
1881 ASSERT(rm
->rm_missingparity
<= rm
->rm_firstdatacol
);
1882 ASSERT(rm
->rm_missingdata
<= rm
->rm_cols
- rm
->rm_firstdatacol
);
1884 for (c
= 0; c
< rm
->rm_cols
; c
++) {
1885 rc
= &rm
->rm_col
[c
];
1888 ASSERT(rc
->rc_error
!= ECKSUM
); /* child has no bp */
1890 if (c
< rm
->rm_firstdatacol
)
1895 if (!rc
->rc_skipped
)
1896 unexpected_errors
++;
1899 } else if (c
< rm
->rm_firstdatacol
&& !rc
->rc_tried
) {
1904 if (zio
->io_type
== ZIO_TYPE_WRITE
) {
1906 * XXX -- for now, treat partial writes as a success.
1907 * (If we couldn't write enough columns to reconstruct
1908 * the data, the I/O failed. Otherwise, good enough.)
1910 * Now that we support write reallocation, it would be better
1911 * to treat partial failure as real failure unless there are
1912 * no non-degraded top-level vdevs left, and not update DTLs
1913 * if we intend to reallocate.
1916 if (total_errors
> rm
->rm_firstdatacol
)
1917 zio
->io_error
= vdev_raidz_worst_error(rm
);
1922 ASSERT(zio
->io_type
== ZIO_TYPE_READ
);
1924 * There are three potential phases for a read:
1925 * 1. produce valid data from the columns read
1926 * 2. read all disks and try again
1927 * 3. perform combinatorial reconstruction
1929 * Each phase is progressively both more expensive and less likely to
1930 * occur. If we encounter more errors than we can repair or all phases
1931 * fail, we have no choice but to return an error.
1935 * If the number of errors we saw was correctable -- less than or equal
1936 * to the number of parity disks read -- attempt to produce data that
1937 * has a valid checksum. Naturally, this case applies in the absence of
1940 if (total_errors
<= rm
->rm_firstdatacol
- parity_untried
) {
1941 if (data_errors
== 0) {
1942 if (raidz_checksum_verify(zio
) == 0) {
1944 * If we read parity information (unnecessarily
1945 * as it happens since no reconstruction was
1946 * needed) regenerate and verify the parity.
1947 * We also regenerate parity when resilvering
1948 * so we can write it out to the failed device
1951 if (parity_errors
+ parity_untried
<
1952 rm
->rm_firstdatacol
||
1953 (zio
->io_flags
& ZIO_FLAG_RESILVER
)) {
1954 n
= raidz_parity_verify(zio
, rm
);
1955 unexpected_errors
+= n
;
1956 ASSERT(parity_errors
+ n
<=
1957 rm
->rm_firstdatacol
);
1963 * We either attempt to read all the parity columns or
1964 * none of them. If we didn't try to read parity, we
1965 * wouldn't be here in the correctable case. There must
1966 * also have been fewer parity errors than parity
1967 * columns or, again, we wouldn't be in this code path.
1969 ASSERT(parity_untried
== 0);
1970 ASSERT(parity_errors
< rm
->rm_firstdatacol
);
1973 * Identify the data columns that reported an error.
1976 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
1977 rc
= &rm
->rm_col
[c
];
1978 if (rc
->rc_error
!= 0) {
1979 ASSERT(n
< VDEV_RAIDZ_MAXPARITY
);
1984 ASSERT(rm
->rm_firstdatacol
>= n
);
1986 code
= vdev_raidz_reconstruct(rm
, tgts
, n
);
1988 if (raidz_checksum_verify(zio
) == 0) {
1989 atomic_inc_64(&raidz_corrected
[code
]);
1992 * If we read more parity disks than were used
1993 * for reconstruction, confirm that the other
1994 * parity disks produced correct data. This
1995 * routine is suboptimal in that it regenerates
1996 * the parity that we already used in addition
1997 * to the parity that we're attempting to
1998 * verify, but this should be a relatively
1999 * uncommon case, and can be optimized if it
2000 * becomes a problem. Note that we regenerate
2001 * parity when resilvering so we can write it
2002 * out to failed devices later.
2004 if (parity_errors
< rm
->rm_firstdatacol
- n
||
2005 (zio
->io_flags
& ZIO_FLAG_RESILVER
)) {
2006 n
= raidz_parity_verify(zio
, rm
);
2007 unexpected_errors
+= n
;
2008 ASSERT(parity_errors
+ n
<=
2009 rm
->rm_firstdatacol
);
2018 * This isn't a typical situation -- either we got a read error or
2019 * a child silently returned bad data. Read every block so we can
2020 * try again with as much data and parity as we can track down. If
2021 * we've already been through once before, all children will be marked
2022 * as tried so we'll proceed to combinatorial reconstruction.
2024 unexpected_errors
= 1;
2025 rm
->rm_missingdata
= 0;
2026 rm
->rm_missingparity
= 0;
2028 for (c
= 0; c
< rm
->rm_cols
; c
++) {
2029 if (rm
->rm_col
[c
].rc_tried
)
2032 zio_vdev_io_redone(zio
);
2034 rc
= &rm
->rm_col
[c
];
2037 zio_nowait(zio_vdev_child_io(zio
, NULL
,
2038 vd
->vdev_child
[rc
->rc_devidx
],
2039 rc
->rc_offset
, rc
->rc_data
, rc
->rc_size
,
2040 zio
->io_type
, zio
->io_priority
, 0,
2041 vdev_raidz_child_done
, rc
));
2042 } while (++c
< rm
->rm_cols
);
2048 * At this point we've attempted to reconstruct the data given the
2049 * errors we detected, and we've attempted to read all columns. There
2050 * must, therefore, be one or more additional problems -- silent errors
2051 * resulting in invalid data rather than explicit I/O errors resulting
2052 * in absent data. We check if there is enough additional data to
2053 * possibly reconstruct the data and then perform combinatorial
2054 * reconstruction over all possible combinations. If that fails,
2057 if (total_errors
> rm
->rm_firstdatacol
) {
2058 zio
->io_error
= vdev_raidz_worst_error(rm
);
2060 } else if (total_errors
< rm
->rm_firstdatacol
&&
2061 (code
= vdev_raidz_combrec(zio
, total_errors
, data_errors
)) != 0) {
2063 * If we didn't use all the available parity for the
2064 * combinatorial reconstruction, verify that the remaining
2065 * parity is correct.
2067 if (code
!= (1 << rm
->rm_firstdatacol
) - 1)
2068 (void) raidz_parity_verify(zio
, rm
);
2071 * We're here because either:
2073 * total_errors == rm_first_datacol, or
2074 * vdev_raidz_combrec() failed
2076 * In either case, there is enough bad data to prevent
2079 * Start checksum ereports for all children which haven't
2080 * failed, and the IO wasn't speculative.
2082 zio
->io_error
= ECKSUM
;
2084 if (!(zio
->io_flags
& ZIO_FLAG_SPECULATIVE
)) {
2085 for (c
= 0; c
< rm
->rm_cols
; c
++) {
2086 rc
= &rm
->rm_col
[c
];
2087 if (rc
->rc_error
== 0) {
2088 zio_bad_cksum_t zbc
;
2089 zbc
.zbc_has_cksum
= 0;
2091 rm
->rm_ecksuminjected
;
2093 zfs_ereport_start_checksum(
2095 vd
->vdev_child
[rc
->rc_devidx
],
2096 zio
, rc
->rc_offset
, rc
->rc_size
,
2097 (void *)(uintptr_t)c
, &zbc
);
2104 zio_checksum_verified(zio
);
2106 if (zio
->io_error
== 0 && spa_writeable(zio
->io_spa
) &&
2107 (unexpected_errors
|| (zio
->io_flags
& ZIO_FLAG_RESILVER
))) {
2109 * Use the good data we have in hand to repair damaged children.
2111 for (c
= 0; c
< rm
->rm_cols
; c
++) {
2112 rc
= &rm
->rm_col
[c
];
2113 cvd
= vd
->vdev_child
[rc
->rc_devidx
];
2115 if (rc
->rc_error
== 0)
2118 zio_nowait(zio_vdev_child_io(zio
, NULL
, cvd
,
2119 rc
->rc_offset
, rc
->rc_data
, rc
->rc_size
,
2120 ZIO_TYPE_WRITE
, zio
->io_priority
,
2121 ZIO_FLAG_IO_REPAIR
| (unexpected_errors
?
2122 ZIO_FLAG_SELF_HEAL
: 0), NULL
, NULL
));
2128 vdev_raidz_state_change(vdev_t
*vd
, int faulted
, int degraded
)
2130 if (faulted
> vd
->vdev_nparity
)
2131 vdev_set_state(vd
, B_FALSE
, VDEV_STATE_CANT_OPEN
,
2132 VDEV_AUX_NO_REPLICAS
);
2133 else if (degraded
+ faulted
!= 0)
2134 vdev_set_state(vd
, B_FALSE
, VDEV_STATE_DEGRADED
, VDEV_AUX_NONE
);
2136 vdev_set_state(vd
, B_FALSE
, VDEV_STATE_HEALTHY
, VDEV_AUX_NONE
);
2139 vdev_ops_t vdev_raidz_ops
= {
2143 vdev_raidz_io_start
,
2145 vdev_raidz_state_change
,
2148 VDEV_TYPE_RAIDZ
, /* name of this vdev type */
2149 B_FALSE
/* not a leaf vdev */