/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_file.h>
#include <sys/vdev_raidz.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R, over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */
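
/*
 * Illustrative sketch, not part of the driver: general GF(2^8)
 * multiplication straight from the definition above, repeatedly
 * multiplying by 2 and reducing by the primitive polynomial (the 0x1d
 * term). The table-driven log/exp approach used below is equivalent
 * but faster.
 */
static inline uint8_t
vdev_raidz_gf_mul_sketch(uint8_t a, uint8_t b)
{
	uint8_t p = 0;

	while (b != 0) {
		if (b & 1)
			p ^= a;		/* field addition is XOR */
		/* a *= 2 in the field, reducing by the primitive polynomial */
		a = (a << 1) ^ ((a & 0x80) ? 0x1d : 0);
		b >>= 1;
	}
	return (p);
}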
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	abd_t *rc_abd;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	abd_t *rm_abd_copy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
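
/*
 * Sketch, not part of the driver: the 64-bit form above is just eight
 * parallel applications of VDEV_RAIDZ_MUL_2(), one per byte lane. This
 * helper demonstrates (and could assert) that equivalence for one word.
 */
static boolean_t
vdev_raidz_64mul_2_lanes_match(uint64_t x)
{
	uint64_t y = x, mask;

	VDEV_RAIDZ_64MUL_2(y, mask);

	for (int i = 0; i < 8; i++) {
		uint8_t lane = (x >> (i * 8)) & 0xff;

		if (((y >> (i * 8)) & 0xff) !=
		    (uint8_t)VDEV_RAIDZ_MUL_2(lane))
			return (B_FALSE);
	}
	return (B_TRUE);
}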
#define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;
/* Powers of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/* Logs of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
static void vdev_raidz_generate_parity(raidz_map_t *rm);

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}
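
/*
 * Sketch, not part of the driver: full GF(2^8) multiplication expressed
 * with the log/exp tables, using the identity A * B =
 * 2^(log_2(A) + log_2(B)) noted in the block comment at the top of this
 * file.
 */
static uint8_t
vdev_raidz_mul_sketch(uint8_t a, uint8_t b)
{
	if (a == 0 || b == 0)
		return (0);
	return (vdev_raidz_exp2(a, vdev_raidz_log2[b]));
}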
static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;
	size_t size;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		abd_free(rm->rm_col[c].rc_abd);

		if (rm->rm_col[c].rc_gdata != NULL)
			zio_buf_free(rm->rm_col[c].rc_gdata,
			    rm->rm_col[c].rc_size);
	}

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		abd_put(rm->rm_col[c].rc_abd);
		size += rm->rm_col[c].rc_size;
	}

	if (rm->rm_abd_copy != NULL)
		abd_free(rm->rm_abd_copy);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}
static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT0(rm->rm_freed);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}
/*ARGSUSED*/
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);

	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
		vdev_raidz_map_free(rm);
}
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;
	size_t x;

	const char *good = NULL;
	char *bad;

	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;
			int offset;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_abd;
				rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
				rm->rm_col[x].rc_abd =
				    abd_get_from_buf(rm->rm_col[x].rc_gdata,
				    rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
				    rm->rm_col[x].rc_size);
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = bad_parity[x];
			}

			offset = 0;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = abd_get_offset(
				    rm->rm_abd_copy, offset);
				offset += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;
		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
	abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
}
/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely. The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;
	size_t offset;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	if (rm->rm_abd_copy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	rm->rm_abd_copy =
	    abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);

	for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];
		abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);

		abd_copy(tmp, col->rc_abd, col->rc_size);
		abd_put(col->rc_abd);
		col->rc_abd = tmp;

		offset += col->rc_size;
	}
	ASSERT3U(offset, ==, size);
}
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};
/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 */
static raidz_map_t *
vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
	uint64_t off = 0;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_abd_copy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_abd = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_abd =
		    abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);

	rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
	off = rm->rm_col[c].rc_size;

	for (c = c + 1; c < acols; c++) {
		rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
		off += rm->rm_col[c].rc_size;
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
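
/*
 * Worked example (a sketch, not part of the driver) of the geometry math
 * above: for a 32 KB write at a 512-byte sector size to a 5-wide RAID-Z1,
 * s = 64 data sectors, q = 64 / (5 - 1) = 16 full rows, r = 0 leftover
 * sectors, bc = 0 big columns, and the map covers tot = 64 + 1 * 16 = 80
 * sectors in all.
 */
static uint64_t
vdev_raidz_map_sectors_sketch(uint64_t s, uint64_t dcols, uint64_t nparity)
{
	uint64_t q = s / (dcols - nparity);	/* full-width rows */
	uint64_t r = s - q * (dcols - nparity);	/* partial-row data sectors */

	/* one parity sector per parity device per (full or partial) row */
	return (s + nparity * (q + (r == 0 ? 0 : 1)));
}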
struct pqr_struct {
	uint64_t *p;
	uint64_t *q;
	uint64_t *r;
};

static int
vdev_raidz_p_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && !pqr->q && !pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++)
		*pqr->p ^= *src;

	return (0);
}

static int
vdev_raidz_pq_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && !pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
	}

	return (0);
}

static int
vdev_raidz_pqr_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
		*pqr->r ^= *src;
	}

	return (0);
}
static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p;
	int c;
	abd_t *src;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, NULL, NULL };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_p_func, &pqr);
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, pcnt, ccnt, mask, i;
	int c;
	abd_t *src;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
			(void) memcpy(q, p, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, q, NULL };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_pq_func, &pqr);
		}

		if (c == rm->rm_firstdatacol) {
			for (i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
			}
		} else {
			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
	int c;
	abd_t *src;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
		r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);

		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
			(void) memcpy(q, p, rm->rm_col[c].rc_size);
			(void) memcpy(r, p, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, q, r };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_pqr_func, &pqr);
		}

		if (c == rm->rm_firstdatacol) {
			for (i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
				r[i] = 0;
			}
		} else {
			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
				VDEV_RAIDZ_64MUL_4(r[i], mask);
			}
		}
	}
}
/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}
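
/*
 * Sketch, not part of the driver: the byte-level effect of the P/Q update
 * loops above for one byte offset across ndata data columns, following the
 * Horner-style expansion of Q from the block comment at the top of this
 * file. The real code does the same thing eight bytes at a time with the
 * VDEV_RAIDZ_64MUL_* macros.
 */
static void
vdev_raidz_pq_byte_sketch(const uint8_t *d, int ndata, uint8_t *p, uint8_t *q)
{
	*p = *q = 0;
	for (int c = 0; c < ndata; c++) {
		*p ^= d[c];				   /* P: plain XOR */
		*q = (uint8_t)VDEV_RAIDZ_MUL_2(*q) ^ d[c]; /* Q: Horner step */
	}
}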
/* ARGSUSED */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	int cnt = size / sizeof (src[0]);

	for (int i = 0; i < cnt; i++) {
		dst[i] ^= src[i];
	}

	return (0);
}

/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, src++) {
		VDEV_RAIDZ_64MUL_2(*dst, mask);
		*dst ^= *src;
	}

	return (0);
}

/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	uint64_t *dst = buf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++) {
		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
		VDEV_RAIDZ_64MUL_2(*dst, mask);
	}

	return (0);
}

struct reconst_q_struct {
	uint64_t *q;
	int exp;
};

static int
vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
{
	struct reconst_q_struct *rq = private;
	uint64_t *dst = buf;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
		*dst ^= *rq->q;

		int j;
		uint8_t *b;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, rq->exp);
		}
	}

	return (0);
}

struct reconst_pq_struct {
	uint8_t *p;
	uint8_t *q;
	uint8_t *pxy;
	uint8_t *qxy;
	int aexp;
	int bexp;
};

static int
vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;
	uint8_t *yd = ybuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
	}

	return (0);
}

static int
vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
		/* same operation as vdev_raidz_reconst_pq_func() on xd */
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
	}

	return (0);
}
static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(x < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
	ASSERT(rm->rm_col[x].rc_size > 0);

	src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	dst = rm->rm_col[x].rc_abd;

	abd_copy(dst, src, rm->rm_col[x].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		uint64_t size = MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == x)
			continue;

		(void) abd_iterate_func2(dst, src, 0, 0, size,
		    vdev_raidz_reconst_p_func, NULL);
	}

	return (1 << VDEV_RAIDZ_P);
}
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c, exp;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == rm->rm_firstdatacol) {
			abd_copy(dst, src, size);
			if (rm->rm_col[x].rc_size > size)
				abd_zero_off(dst, size,
				    rm->rm_col[x].rc_size - size);
		} else {
			ASSERT3U(size, <=, rm->rm_col[x].rc_size);
			(void) abd_iterate_func2(dst, src, 0, 0, size,
			    vdev_raidz_reconst_q_pre_func, NULL);
			(void) abd_iterate_func(dst,
			    size, rm->rm_col[x].rc_size - size,
			    vdev_raidz_reconst_q_pre_tail_func, NULL);
		}
	}

	src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	dst = rm->rm_col[x].rc_abd;
	exp = 255 - (rm->rm_cols - 1 - x);

	struct reconst_q_struct rq = { abd_to_buf(src), exp };
	(void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
	    vdev_raidz_reconst_q_post_func, &rq);

	return (1 << VDEV_RAIDZ_Q);
}
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
	abd_t *pdata, *qdata;
	uint64_t xsize, ysize;
	int x = tgts[0];
	int y = tgts[1];
	abd_t *xd, *yd;

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = abd_to_buf(pdata);
	q = abd_to_buf(qdata);
	pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
	xd = rm->rm_col[x].rc_abd;
	yd = rm->rm_col[y].rc_abd;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	ASSERT3U(xsize, >=, ysize);
	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
	    vdev_raidz_reconst_pq_func, &rpq);
	(void) abd_iterate_func(xd, ysize, xsize - ysize,
	    vdev_raidz_reconst_pq_tail_func, &rpq);

	abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, for speed of
 * computation, and for linear separability.
 *
 *      __                  __               __     __
 *      |  1    ..  1  1  1  |               |  p_0  |
 *      | 2^n-1 ..  4  2  1  |   __     __   |   :   |
 *      | 4^n-1 .. 16  4  1  |   |  D_0  |   | p_m-1 |
 *      |  1    ..  0  0  0  |   |  D_1  |   |  d_0  |
 *      |  0    ..  0  0  0  | x |  D_2  | = |  d_1  |
 *      |  :        :  :  :  |   |   :   |   |  d_2  |
 *      |  0    ..  1  0  0  |   | D_n-1 |   |   :   |
 *      |  0    ..  0  1  0  |   ~~     ~~   |   :   |
 *      |  0    ..  0  0  1  |               | d_n-1 |
 *      ~~                  ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *
 *           __                                      __
 *           |   1    1    1    1    1    1    1    1 |
 *           | 128   64   32   16    8    4    2    1 | <-----+-+-- missing disks
 *           |  19  205  116   29   64   16    4    1 |      / /
 *           |   1    0    0    0    0    0    0    0 |     / /
 *           |   0    1    0    0    0    0    0    0 | <--'  /
 *   (V|I) = |   0    0    1    0    0    0    0    0 | <---'
 *           |   0    0    0    1    0    0    0    0 |
 *           |   0    0    0    0    1    0    0    0 |
 *           |   0    0    0    0    0    1    0    0 |
 *           |   0    0    0    0    0    0    1    0 |
 *           |   0    0    0    0    0    0    0    1 |
 *           ~~                                      ~~
 *            __                                      __
 *            |   1    1    1    1    1    1    1    1 |
 *            |  19  205  116   29   64   16    4    1 |
 *            |   1    0    0    0    0    0    0    0 |
 *   (V|I)' = |   0    0    0    1    0    0    0    0 |
 *            |   0    0    0    0    1    0    0    0 |
 *            |   0    0    0    0    0    1    0    0 |
 *            |   0    0    0    0    0    0    1    0 |
 *            |   0    0    0    0    0    0    0    1 |
 *            ~~                                      ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 *   __                                                                 __
 *   |   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0 |
 *   |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0 |
 *   |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *   |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *   |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *   |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *   |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *   |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *   ~~                                                                 ~~
 *   __                                                                 __
 *   |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *   |   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0 |
 *   |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0 |
 *   |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *   |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *   |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *   |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *   |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *   ~~                                                                 ~~
 *   __                                                                 __
 *   |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *   |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 *   |   0 205 116   0   0   0   0   0     0   1  19  29  64  16   4   1 |
 *   |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *   |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *   |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *   |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *   |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *   ~~                                                                 ~~
 *   __                                                                 __
 *   |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *   |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 *   |   0   0 185   0   0   0   0   0   205   1 222 208 141 221 201 204 |
 *   |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *   |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *   |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *   |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *   |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *   ~~                                                                 ~~
 *   __                                                                 __
 *   |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *   |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 *   |   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209 |
 *   |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *   |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *   |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *   |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *   |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *   ~~                                                                 ~~
 *   __                                                                 __
 *   |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *   |   0   1   0   0   0   0   0   0   167 100   5  41 159 169 217 208 |
 *   |   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209 |
 *   |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *   |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *   |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *   |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *   |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *   ~~                                                                 ~~
 *
 *                __                               __
 *                |   0   0   1   0   0   0   0   0 |
 *                | 167 100   5  41 159 169 217 208 |
 *                | 166 100   4  40 158 168 216 209 |
 *   (V|I)'^-1 =  |   0   0   0   1   0   0   0   0 |
 *                |   0   0   0   0   1   0   0   0 |
 *                |   0   0   0   0   0   1   0   0 |
 *                |   0   0   0   0   0   0   1   0 |
 *                |   0   0   0   0   0   0   0   1 |
 *                ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = abd_to_buf(rm->rm_col[c].rc_abd);
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	abd_t **bufs = NULL;

	int code = 0;

	/*
	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
	 * temporary linear ABDs.
	 */
	if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
		bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);

		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];

			bufs[c] = col->rc_abd;
			col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
			abd_copy(col->rc_abd, bufs[c], col->rc_size);
		}
	}

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	/*
	 * copy back from temporary linear abds and free them
	 */
	if (bufs) {
		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];

			abd_copy(bufs[c], col->rc_abd, col->rc_size);
			abd_free(col->rc_abd);
			col->rc_abd = bufs[c];
		}
		kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
	}

	return (code);
}
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*ashift = MAX(*ashift, cvd->vdev_ashift);
	}

	*asize *= vd->vdev_children;
	*max_asize *= vd->vdev_children;

	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}
static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}
/*
 * Handle a read or write I/O to a RAID-Z dump device.
 *
 * The dump device is in a unique situation compared to other ZFS datasets:
 * writing to this device should be as simple and fast as possible. In
 * addition, durability matters much less since the dump will be extracted
 * once the machine reboots. For that reason, this function eschews parity for
 * performance and simplicity. The dump device uses the checksum setting
 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
 * dataset.
 *
 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
 * 128 KB will not fill an entire block; in addition, they may not be properly
 * aligned. In that case, this function uses the preallocated 128 KB block and
 * omits reading or writing any "empty" portions of that block, as opposed to
 * allocating a fresh appropriately-sized block.
 *
 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
 *
 *	vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
 *
 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
 * allocated which spans all five child vdevs. 8 KB of data would be written to
 * each of four vdevs, with the fifth containing the parity bits.
 *
 *       parity    data     data     data     data
 *     |   PP   |   XX   |   XX   |   XX   |   XX   |
 *         ^        ^        ^        ^        ^
 *         |        |        |        |        |
 *   8 KB parity    ------8 KB data blocks------
 *
 * However, when writing to the dump device, the behavior is different:
 *
 *	vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
 *
 * Unlike the normal RAID-Z case in which the block is allocated based on the
 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
 * I/O size is less than 128 KB, only the actual portions of data are written.
 * In this example the data is written to the third data vdev since that vdev
 * contains the offset [64 KB, 96 KB).
 *
 *       parity    data     data     data     data
 *     |        |        |        |   XX   |        |
 *                                    ^
 *                                    |
 *                             32 KB data block
 *
 * As a result, an individual I/O may not span all child vdevs; moreover, a
 * small I/O may only operate on a single child vdev.
 *
 * Note that since there are no parity bits calculated or written, this format
 * remains the same no matter how many parity bits are used in a normal RAID-Z
 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
 * would look like:
 *
 *       parity   parity   parity    data     data     data     data
 *     |        |        |        |        |        |   XX   |        |
 *                                                      ^
 *                                                      |
 *                                               32 KB data block
 */
int
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, err = 0;

	uint64_t start, end, colstart, colend;
	uint64_t coloffset, colsize, colskip;

	int flags = doread ? B_READ : B_WRITE;

#ifdef	_KERNEL

	/*
	 * Don't write past the end of the block
	 */
	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);

	start = offset;
	end = start + size;

	/*
	 * Allocate a RAID-Z map for this block. Note that this block starts
	 * from the "original" offset, that is, the offset of the extent which
	 * contains the requisite offset of the data being read or written.
	 *
	 * Even if this I/O operation doesn't span the full block size, let's
	 * treat the on-disk format as if the only blocks are the complete 128
	 * KB size.
	 */
	abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
	    SPA_OLD_MAXBLOCKSIZE);
	rm = vdev_raidz_map_alloc(abd,
	    SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
	    vd->vdev_children, vd->vdev_nparity);

	coloffset = origoffset;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
	    c++, coloffset += rc->rc_size) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];

		/*
		 * Find the start and end of this column in the RAID-Z map,
		 * keeping in mind that the stated size and offset of the
		 * operation may not fill the entire column for this vdev.
		 *
		 * If any portion of the data spans this column, issue the
		 * appropriate operation to the vdev.
		 */
		if (coloffset + rc->rc_size <= start)
			continue;
		if (coloffset >= end)
			continue;

		colstart = MAX(coloffset, start);
		colend = MIN(end, coloffset + rc->rc_size);
		colsize = colend - colstart;
		colskip = colstart - coloffset;

		VERIFY3U(colsize, <=, rc->rc_size);
		VERIFY3U(colskip, <=, rc->rc_size);

		/*
		 * Note that the child vdev will have a vdev label at the start
		 * of its range of offsets, hence the need for
		 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
		 * example of why this calculation is needed.
		 */
		if ((err = vdev_disk_physio(cvd,
		    ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
		    flags, isdump)) != 0)
			break;
	}

	vdev_raidz_map_free(rm);
	abd_put(abd);
#endif	/* _KERNEL */

	return (err);
}
static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t cols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;

	asize = ((psize - 1) >> ashift) + 1;
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = roundup(asize, nparity + 1) << ashift;

	return (asize);
}
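
/*
 * Worked example (a sketch, not part of the driver): for a hypothetical
 * 5-wide RAID-Z1 top-level vdev with ashift = 9, a 32 KB block needs 64
 * data sectors, parity adds ceil(64 / 4) = 16 more, and 80 is already a
 * multiple of nparity + 1 = 2, so the allocated size is 40 KB.
 */
static void
vdev_raidz_asize_sketch(vdev_t *raidz1_5wide_vd)
{
	ASSERT3U(vdev_raidz_asize(raidz1_5wide_vd, 32 << 10), ==, 40 << 10);
}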
static void
vdev_raidz_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;
	rc->rc_skipped = 0;
}

/*
 * Start an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Generate the parity data
 *   2. Create child zio write operations to each column's vdev, for both
 *      data and parity.
 *   3. If the column skips any sectors for padding, create optional dummy
 *      write zio children for those areas to improve aggregation continuity.
 * - For read operations:
 *   1. Create child zio read operations to each data column's vdev to read
 *      the range of data required for zio.
 *   2. If this is a scrub or resilver operation, or if any of the data
 *      vdevs have had errors, then create zio read operations to the parity
 *      columns' VDevs as well.
 */
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
	    tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		zio_execute(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	zio_execute(zio);
}
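
/*
 * Editorial note: the ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL children issued
 * for skip sectors in vdev_raidz_io_start() carry no payload; they exist
 * so the vdev queue can aggregate writes across the padding gap, and an
 * optional child that does not end up helping aggregation can complete
 * without touching the disk.
 */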
/*
 * Report a checksum error for a child of a RAID-Z device.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
	void *buf;
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		zio_bad_cksum_t zbc;
		raidz_map_t *rm = zio->io_vsd;

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		zbc.zbc_has_cksum = 0;
		zbc.zbc_injected = rm->rm_ecksuminjected;

		buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
		    rc->rc_offset, rc->rc_size, buf, bad_data,
		    &zbc);
		abd_return_buf(rc->rc_abd, buf, rc->rc_size);
	}
}
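
/*
 * Editorial note: abd_borrow_buf_copy() above hands the ereport code a
 * linear buffer pre-filled with the column's contents; since the buffer
 * is only read, the non-copying abd_return_buf() is the right way to give
 * it back.
 */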
/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t zbc;
	raidz_map_t *rm = zio->io_vsd;

	int ret = zio_checksum_error(zio, &zbc);
	if (ret != 0 && zbc.zbc_injected != 0)
		rm->rm_ecksuminjected = 1;

	return (ret);
}
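
/*
 * Editorial note: zio_checksum_error() fills in the zio_bad_cksum_t when
 * verification fails; zbc_injected marks failures planted by the error
 * injection framework (zinject), which is why the map remembers it for
 * any ereports posted later.
 */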
/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches
 * the data we read. If it doesn't, we fire off a checksum error. Return
 * the number of such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));

	if (checksum == ZIO_CHECKSUM_NOPARITY)
		return (ret);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zio_buf_alloc(rc->rc_size);
		abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
			raidz_checksum_error(zio, rc, orig[c]);
			rc->rc_error = SET_ERROR(ECKSUM);
			ret++;
		}
		zio_buf_free(orig[c], rc->rc_size);
	}

	return (ret);
}
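
/*
 * Editorial note (an assumption, not from the original comments): the
 * ZIO_CHECKSUM_NOPARITY early return in raidz_parity_verify() exists for
 * blocks written through vdev_raidz_physio(), e.g. crash dumps, which
 * bypass normal parity generation; regenerating and comparing parity for
 * such blocks would report false checksum errors.
 */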
/*
 * Keep statistics on all the ways that we used parity to correct data.
 */
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
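
/*
 * Editorial note: vdev_raidz_reconstruct() returns a small bitmask naming
 * the parity columns that were consumed (bit 0 for P, bit 1 for Q, bit 2
 * for R), so, for example, raidz_corrected[5] counts recoveries that used
 * P and R together.
 */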
static int
vdev_raidz_worst_error(raidz_map_t *rm)
{
	int error = 0;

	for (int c = 0; c < rm->rm_cols; c++)
		error = zio_worst_error(error, rm->rm_col[c].rc_error);

	return (error);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				abd_copy_to_buf(orig[i], rc->rc_abd,
				    rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				abd_copy_from_buf(rc->rc_abd, orig[i],
				    rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
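
/*
 * Editorial illustration (not part of the original file): the odometer-style
 * combination walk used by vdev_raidz_combrec() above, extracted into a
 * stand-alone user-space sketch. Compile it on its own with
 * -DRAIDZ_COMBREC_EXAMPLE; the names, the column count, and the failed
 * column are invented for the example, and the "push the last target into
 * the data columns" special case is omitted for brevity.
 */
#ifdef RAIDZ_COMBREC_EXAMPLE
#include <stdio.h>

#define	TOY_COLS	5

static void
enumerate_targets(const int rc_error[TOY_COLS], int n)
{
	int tstore[TOY_COLS + 2];
	int *tgts = &tstore[1];	/* tgts[-1] and tgts[n] are sentinels */
	int current, next, c, i;

	tgts[-1] = -1;

	/* Seed the targets with the first n error-free columns. */
	for (c = 0, i = 0; i < n; i++) {
		while (rc_error[c] != 0)
			c++;
		tgts[i] = c++;
	}
	tgts[n] = TOY_COLS;

	current = 0;
	next = tgts[current];

	while (current != n) {
		tgts[current] = next;
		current = 0;

		/* Here the driver would attempt a reconstruction. */
		for (i = 0; i < n; i++)
			printf("%d%c", tgts[i], i == n - 1 ? '\n' : ' ');

		do {
			/* Find the next error-free column after this digit. */
			for (next = tgts[current] + 1; next < TOY_COLS &&
			    rc_error[next] != 0; next++)
				continue;
			if (next != tgts[current + 1])
				break;
			/* Carry: reset this digit just past its neighbor. */
			for (c = tgts[current - 1] + 1; rc_error[c] != 0; c++)
				continue;
			tgts[current] = c;
			current++;
		} while (current != n);
	}
}

int
main(void)
{
	/* Toy column states: nonzero means "already known bad". */
	int rc_error[TOY_COLS] = { 0, 0, 1, 0, 0 };

	enumerate_targets(rc_error, 2);	/* prints all 2-column target sets */
	return (0);
}
#endif	/* RAIDZ_COMBREC_EXAMPLE */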
/*
 * Complete an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Check for errors on the child IOs.
 *   2. Return, setting an error code if too few child VDevs were written
 *      to reconstruct the data later. Note that partial writes are
 *      considered successful if they can be reconstructed at all.
 * - For read operations:
 *   1. Check for errors on the child IOs.
 *   2. If data errors occurred:
 *      a. Try to reassemble the data from the parity available.
 *      b. If we haven't yet read the parity drives, read them now.
 *      c. If all parity drives have been read but the data still doesn't
 *         reassemble with a correct checksum, then try combinatorial
 *         reconstruction.
 *      d. If that doesn't work, return an error.
 *   3. If there were unexpected errors or this is a resilver operation,
 *      rewrite the vdevs that had errors.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);	/* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed. Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

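	/*
	 * Editorial note: in the loop above, zio_vdev_io_redone() runs at
	 * most once -- when the first untried column is found -- and the
	 * inner do/while then issues reads for every remaining untried
	 * column before the function returns to await those children.
	 */
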
	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm->rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children that haven't
		 * failed, as long as the I/O wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}

static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted > vd->vdev_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}
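
/*
 * Worked example (editorial): for raidz2 (vdev_nparity == 2), two faulted
 * children leave the vdev DEGRADED but still readable, while a third
 * faulted child exceeds the parity count and the vdev can no longer be
 * opened.
 */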
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,
	NULL,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};