/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_file.h>
#include <sys/vdev_raidz.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>

#ifdef ZFS_DEBUG
#include <sys/vdev_initialize.h>	/* vdev_xlate testing */
#endif
/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	abd_t *rc_abd;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	abd_t *rm_abd_copy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
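
/*
 * Illustrative sketch (ours, not part of the driver; compiled out): a worked
 * example of VDEV_RAIDZ_64MUL_2 on eight packed bytes. For a byte with the
 * top bit set, such as 0x80, the mask byte becomes 0xff (0x100 - 0x01) so
 * the reduction constant 0x1d is applied and 0x80 * 2 = 0x1d; for 0x01 the
 * mask byte is 0x00 and 0x01 * 2 = 0x02. The guard macro and function name
 * below are hypothetical, chosen only for this example.
 */
#ifdef ZFS_RAIDZ_64MUL_EXAMPLE
static void
vdev_raidz_64mul_example(void)
{
	/* bytes 80 01 80 80 01 01 01 01 map to 1d 02 1d 1d 02 02 02 02 */
	uint64_t x = 0x8001808001010101ULL;
	uint64_t mask;

	VDEV_RAIDZ_64MUL_2(x, mask);
	ASSERT3U(x, ==, 0x1d021d1d02020202ULL);
}
#endif	/* ZFS_RAIDZ_64MUL_EXAMPLE */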

#define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;

/* Powers of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};

/* Logs of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
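
/*
 * Illustrative sketch (ours, compiled out): general GF(2^8) multiplication
 * via the log/exp tables above, i.e. A * B = 2^(log_2(A) + log_2(B)) with
 * the exponent reduced mod 255, the order of the multiplicative group. The
 * guard macro and function name are hypothetical, for illustration only;
 * the driver itself only needs multiplication by powers of 2 (see
 * vdev_raidz_exp2() below).
 */
#ifdef ZFS_RAIDZ_GF_EXAMPLE
static uint8_t
vdev_raidz_gf_mul_example(uint8_t a, uint8_t b)
{
	int exp;

	if (a == 0 || b == 0)	/* 0 has no logarithm; 0 * x = 0 */
		return (0);

	exp = vdev_raidz_log2[a] + vdev_raidz_log2[b];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}
#endif	/* ZFS_RAIDZ_GF_EXAMPLE */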

static void vdev_raidz_generate_parity(raidz_map_t *rm);

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;
	size_t size;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		abd_free(rm->rm_col[c].rc_abd);

		if (rm->rm_col[c].rc_gdata != NULL)
			zio_buf_free(rm->rm_col[c].rc_gdata,
			    rm->rm_col[c].rc_size);
	}

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		abd_put(rm->rm_col[c].rc_abd);
		size += rm->rm_col[c].rc_size;
	}

	if (rm->rm_abd_copy != NULL)
		abd_free(rm->rm_abd_copy);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT0(rm->rm_freed);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}

/*ARGSUSED*/
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);

	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
		vdev_raidz_map_free(rm);
}

static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;
	size_t x;

	const char *good = NULL;
	char *bad;

	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;
			int offset;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_abd;
				rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
				rm->rm_col[x].rc_abd =
				    abd_get_from_buf(rm->rm_col[x].rc_gdata,
				    rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
				    rm->rm_col[x].rc_size);
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = bad_parity[x];
			}

			offset = 0;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = abd_get_offset(
				    rm->rm_abd_copy, offset);
				offset += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
	abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
}

/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely. The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;
	size_t offset;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount  */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	if (rm->rm_abd_copy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	rm->rm_abd_copy =
	    abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);

	for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];
		abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);

		abd_copy(tmp, col->rc_abd, col->rc_size);
		abd_put(col->rc_abd);
		col->rc_abd = tmp;

		offset += col->rc_size;
	}
	ASSERT3U(offset, ==, size);
}

static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};

/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 */
static raidz_map_t *
vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
	uint64_t off = 0;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_abd_copy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_abd = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_abd =
		    abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);

	rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
	off = rm->rm_col[c].rc_size;

	for (c = c + 1; c < acols; c++) {
		rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
		off += rm->rm_col[c].rc_size;
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
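
/*
 * Illustrative sketch (ours, compiled out): the map geometry computed by
 * vdev_raidz_map_alloc() above for an 8-sector write to a 5-wide raidz1
 * vdev. The guard macro and function name are hypothetical, chosen only
 * for this example.
 */
#ifdef ZFS_RAIDZ_MAP_EXAMPLE
static void
vdev_raidz_map_geometry_example(void)
{
	uint64_t s = 8, dcols = 5, nparity = 1;
	uint64_t q = s / (dcols - nparity);		/* 2 full rows */
	uint64_t r = s - q * (dcols - nparity);		/* 0 remainder */
	uint64_t bc = (r == 0 ? 0 : r + nparity);	/* no big columns */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));

	ASSERT3U(bc, ==, 0);
	ASSERT3U(tot, ==, 10);	/* 8 data + 2 parity sectors */
	/* rm_nskip: no padding needed, 10 is already a multiple of 2 */
	ASSERT3U(roundup(tot, nparity + 1) - tot, ==, 0);
}
#endif	/* ZFS_RAIDZ_MAP_EXAMPLE */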

struct pqr_struct {
	uint64_t *p;
	uint64_t *q;
	uint64_t *r;
};

static int
vdev_raidz_p_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && !pqr->q && !pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++)
		*pqr->p ^= *src;

	return (0);
}

static int
vdev_raidz_pq_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && !pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
	}

	return (0);
}

static int
vdev_raidz_pqr_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
		*pqr->r ^= *src;
	}

	return (0);
}

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p;
	int c;
	abd_t *src;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, NULL, NULL };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_p_func, &pqr);
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, pcnt, ccnt, mask, i;
	int c;
	abd_t *src;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
			(void) memcpy(q, p, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, q, NULL };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_pq_func, &pqr);
		}

		if (c == rm->rm_firstdatacol) {
			for (i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
			}
		} else {
			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
	int c;
	abd_t *src;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
		r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);

		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
			(void) memcpy(q, p, rm->rm_col[c].rc_size);
			(void) memcpy(r, p, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, q, r };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_pqr_func, &pqr);
		}

		if (c == rm->rm_firstdatacol) {
			for (i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
				r[i] = 0;
			}
		} else {
			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
				VDEV_RAIDZ_64MUL_4(r[i], mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}

/* ARGSUSED */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	int cnt = size / sizeof (src[0]);

	for (int i = 0; i < cnt; i++) {
		dst[i] ^= src[i];
	}

	return (0);
}

/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, src++) {
		VDEV_RAIDZ_64MUL_2(*dst, mask);
		*dst ^= *src;
	}

	return (0);
}

/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	uint64_t *dst = buf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++) {
		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
		VDEV_RAIDZ_64MUL_2(*dst, mask);
	}

	return (0);
}

struct reconst_q_struct {
	uint64_t *q;
	int exp;
};

static int
vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
{
	struct reconst_q_struct *rq = private;
	uint64_t *dst = buf;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
		*dst ^= *rq->q;

		int j;
		uint8_t *b;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, rq->exp);
		}
	}

	return (0);
}

struct reconst_pq_struct {
	uint8_t *p;
	uint8_t *q;
	uint8_t *pxy;
	uint8_t *qxy;
	int aexp;
	int bexp;
};

static int
vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;
	uint8_t *yd = ybuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
	}

	return (0);
}

static int
vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
		/* same operation as vdev_raidz_reconst_pq_func() on xd */
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
	}

	return (0);
}

static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(x < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
	ASSERT(rm->rm_col[x].rc_size > 0);

	src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	dst = rm->rm_col[x].rc_abd;

	abd_copy(dst, src, rm->rm_col[x].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		uint64_t size = MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == x)
			continue;

		(void) abd_iterate_func2(dst, src, 0, 0, size,
		    vdev_raidz_reconst_p_func, NULL);
	}

	return (1 << VDEV_RAIDZ_P);
}

static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c, exp;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == rm->rm_firstdatacol) {
			abd_copy(dst, src, size);
			if (rm->rm_col[x].rc_size > size)
				abd_zero_off(dst, size,
				    rm->rm_col[x].rc_size - size);
		} else {
			ASSERT3U(size, <=, rm->rm_col[x].rc_size);
			(void) abd_iterate_func2(dst, src, 0, 0, size,
			    vdev_raidz_reconst_q_pre_func, NULL);
			(void) abd_iterate_func(dst,
			    size, rm->rm_col[x].rc_size - size,
			    vdev_raidz_reconst_q_pre_tail_func, NULL);
		}
	}

	src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	dst = rm->rm_col[x].rc_abd;
	exp = 255 - (rm->rm_cols - 1 - x);

	struct reconst_q_struct rq = { abd_to_buf(src), exp };
	(void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
	    vdev_raidz_reconst_q_post_func, &rq);

	return (1 << VDEV_RAIDZ_Q);
}

static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
	abd_t *pdata, *qdata;
	uint64_t xsize, ysize;
	int x = tgts[0];
	int y = tgts[1];
	abd_t *xd, *yd;

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = abd_to_buf(pdata);
	q = abd_to_buf(qdata);
	pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
	xd = rm->rm_col[x].rc_abd;
	yd = rm->rm_col[y].rc_abd;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	ASSERT3U(xsize, >=, ysize);
	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
	    vdev_raidz_reconst_pq_func, &rpq);
	(void) abd_iterate_func(xd, ysize, xsize - ysize,
	    vdev_raidz_reconst_pq_tail_func, &rpq);

	abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
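
/*
 * Illustrative sketch (ours, compiled out): a check of the inverse trick
 * used above, where tmp = 255 - log2(a ^ 1) is the logarithm of
 * (2^(x - y) + 1)^-1. Raising (a ^ 1) by that power of 2 must yield the
 * multiplicative identity. The guard macro, function name, and the choice
 * x = 2, y = 3 are hypothetical, for illustration only.
 */
#ifdef ZFS_RAIDZ_PQ_EXAMPLE
static void
vdev_raidz_pq_inverse_example(void)
{
	uint8_t a = vdev_raidz_pow2[255 + 2 - 3];	/* a = 2^(x - y) */
	uint8_t tmp = 255 - vdev_raidz_log2[a ^ 1];

	/* (a + 1) * (a + 1)^-1 == 1 in GF(2^8) */
	ASSERT3U(vdev_raidz_exp2(a ^ 1, tmp), ==, 1);
}
#endif	/* ZFS_RAIDZ_PQ_EXAMPLE */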

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |   __     __         |  p_0  |
 *            |  V  |   | D_0  |          | p_m-1 |
 *            |     | x |  :   |    =     |  d_0  |
 *            |  I  |   | D_n-1|          |   :   |
 *            |     |   ~~    ~~          | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
 * computation as well as linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   | D_0  |    | p_m-1 |
 *      |   1   ..  0 0 0 |   | D_1  |    |  d_0  |
 *      |   0   ..  0 0 0 | x | D_2  | =  |  d_1  |
 *      |   :       : : : |   |  :   |    |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1|    |   :   |
 *      |   0   ..  0 1 0 |   ~~    ~~    |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16  4   1  |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0   1   0   0   0   0   0    166 100  4  40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   0   0   0   0   0   0    167 100  5  41 159 169 217 208 |
 * |  0   0   1   0   0   0   0   0    166 100  4  40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 *               __                               __
 *               |  0   0   1   0   0   0   0   0  |
 *               | 167 100  5  41 159 169 217 208  |
 *               | 166 100  4  40 158 168 216 209  |
 *  (V|I)'^-1 =  |  0   0   0   1   0   0   0   0  |
 *               |  0   0   0   0   1   0   0   0  |
 *               |  0   0   0   0   0   1   0   0  |
 *               |  0   0   0   0   0   0   1   0  |
 *               |  0   0   0   0   0   0   0   1  |
 *               ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */
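
/*
 * Illustrative sketch (ours, compiled out): the non-trivial rows built by
 * vdev_raidz_matrix_init() below. For the Q row (map value 1) over n data
 * columns, entry j is 2^(n - 1 - j); with n = 8 that is the row
 * { 128, 64, 32, 16, 8, 4, 2, 1 } from the example above. The guard macro
 * and function name are hypothetical, for illustration only.
 */
#ifdef ZFS_RAIDZ_MATRIX_EXAMPLE
static void
vdev_raidz_matrix_row_example(void)
{
	int n = 8, j, pow;
	static const uint8_t qrow[] = { 128, 64, 32, 16, 8, 4, 2, 1 };

	pow = 1 * n;	/* map value 1 selects the Q coefficients */
	for (j = 0; j < n; j++) {
		pow -= 1;
		if (pow < 0)
			pow += 255;
		ASSERT3U(vdev_raidz_pow2[pow], ==, qrow[j]);
	}
}
#endif	/* ZFS_RAIDZ_MATRIX_EXAMPLE */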

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = abd_to_buf(rm->rm_col[c].rc_abd);
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	abd_t **bufs = NULL;

	int code = 0;

	/*
	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
	 * temporary linear ABDs.
	 */
	if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
		bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);

		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];

			bufs[c] = col->rc_abd;
			col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
			abd_copy(col->rc_abd, bufs[c], col->rc_size);
		}
	}

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	/*
	 * copy back from temporary linear abds and free them
	 */
	if (bufs) {
		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];

			abd_copy(bufs[c], col->rc_abd, col->rc_size);
			abd_free(col->rc_abd);
			col->rc_abd = bufs[c];
		}
		kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
	}

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*ashift = MAX(*ashift, cvd->vdev_ashift);
	}

	*asize *= vd->vdev_children;
	*max_asize *= vd->vdev_children;

	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

/*
 * Handle a read or write I/O to a RAID-Z dump device.
 *
 * The dump device is in a unique situation compared to other ZFS datasets:
 * writing to this device should be as simple and fast as possible. In
 * addition, durability matters much less since the dump will be extracted
 * once the machine reboots. For that reason, this function eschews parity for
 * performance and simplicity. The dump device uses the checksum setting
 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
 * dataset.
 *
 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
 * 128 KB will not fill an entire block; in addition, they may not be properly
 * aligned. In that case, this function uses the preallocated 128 KB block and
 * omits reading or writing any "empty" portions of that block, as opposed to
 * allocating a fresh appropriately-sized block.
 *
 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
 *
 *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
 *
 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
 * allocated which spans all five child vdevs. 8 KB of data would be written to
 * each of four vdevs, with the fifth containing the parity bits.
 *
 *       parity    data     data     data     data
 *     |   PP   |   XX   |   XX   |   XX   |   XX   |
 *         ^        ^        ^        ^        ^
 *         |        |        |        |        |
 *   8 KB parity    ------8 KB data blocks------
 *
 * However, when writing to the dump device, the behavior is different:
 *
 *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
 *
 * Unlike the normal RAID-Z case in which the block is allocated based on the
 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
 * I/O size is less than 128 KB, only the actual portions of data are written.
 * In this example the data is written to the third data vdev since that vdev
 * contains the offset [64 KB, 96 KB).
 *
 *       parity    data     data     data     data
 *     |        |        |        |   XX   |        |
 *                                    ^
 *                                    |
 *                             32 KB data block
 *
 * As a result, an individual I/O may not span all child vdevs; moreover, a
 * small I/O may only operate on a single child vdev.
 *
 * Note that since there are no parity bits calculated or written, this format
 * remains the same no matter how many parity bits are used in a normal RAID-Z
 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
 * would look like:
 *
 *       parity   parity   parity    data     data     data     data
 *     |        |        |        |        |        |   XX   |        |
 *                                                      ^
 *                                                      |
 *                                               32 KB data block
 */
int
vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
{
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, err = 0;

	uint64_t start, end, colstart, colend;
	uint64_t coloffset, colsize, colskip;

	int flags = doread ? B_READ : B_WRITE;

#ifdef	_KERNEL

	/*
	 * Don't write past the end of the block
	 */
	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);

	start = offset;
	end = start + size;

	/*
	 * Allocate a RAID-Z map for this block. Note that this block starts
	 * from the "original" offset, this is, the offset of the extent which
	 * contains the requisite offset of the data being read or written.
	 *
	 * Even if this I/O operation doesn't span the full block size, let's
	 * treat the on-disk format as if the only blocks are the complete 128
	 * KB size.
	 */
	abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
	    SPA_OLD_MAXBLOCKSIZE);
	rm = vdev_raidz_map_alloc(abd,
	    SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
	    vd->vdev_children, vd->vdev_nparity);

	coloffset = origoffset;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
	    c++, coloffset += rc->rc_size) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];

		/*
		 * Find the start and end of this column in the RAID-Z map,
		 * keeping in mind that the stated size and offset of the
		 * operation may not fill the entire column for this vdev.
		 *
		 * If any portion of the data spans this column, issue the
		 * appropriate operation to the vdev.
		 */
		if (coloffset + rc->rc_size <= start)
			continue;
		if (coloffset >= end)
			continue;

		colstart = MAX(coloffset, start);
		colend = MIN(end, coloffset + rc->rc_size);
		colsize = colend - colstart;
		colskip = colstart - coloffset;

		VERIFY3U(colsize, <=, rc->rc_size);
		VERIFY3U(colskip, <=, rc->rc_size);

		/*
		 * Note that the child vdev will have a vdev label at the start
		 * of its range of offsets, hence the need for
		 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
		 * example of why this calculation is needed.
		 */
		if ((err = vdev_disk_physio(cvd,
		    ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
		    flags, isdump)) != 0)
			break;
	}

	vdev_raidz_map_free(rm);
	abd_put(abd);
#endif	/* KERNEL */

	return (err);
}

static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t cols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;

	asize = ((psize - 1) >> ashift) + 1;
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = roundup(asize, nparity + 1) << ashift;

	return (asize);
}
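
/*
 * Illustrative sketch (ours, compiled out): the allocated size computed by
 * vdev_raidz_asize() for a 32 KB block on a 5-wide raidz1 vdev with
 * 512-byte sectors, matching the "at least 40 KB" figure in the dump-device
 * comment above. The guard macro and function name are hypothetical,
 * chosen only for this example.
 */
#ifdef ZFS_RAIDZ_ASIZE_EXAMPLE
static void
vdev_raidz_asize_example(void)
{
	uint64_t ashift = 9, cols = 5, nparity = 1;
	uint64_t asize = ((32768 - 1) >> ashift) + 1;	/* 64 data sectors */

	/* 16 parity sectors, one per 4-sector data row */
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = roundup(asize, nparity + 1) << ashift;

	ASSERT3U(asize, ==, 40960);	/* 40 KB */
}
#endif	/* ZFS_RAIDZ_ASIZE_EXAMPLE */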

static void
vdev_raidz_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;
	rc->rc_skipped = 0;
}

static void
vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col)
{
#ifdef ZFS_DEBUG
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;

	range_seg_t logical_rs, physical_rs;
	logical_rs.rs_start = zio->io_offset;
	logical_rs.rs_end = logical_rs.rs_start +
	    vdev_raidz_asize(zio->io_vd, zio->io_size);

	raidz_col_t *rc = &rm->rm_col[col];
	vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

	vdev_xlate(cvd, &logical_rs, &physical_rs);
	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
	/*
	 * It would be nice to assert that rs_end is equal
	 * to rc_offset + rc_size but there might be an
	 * optional I/O at the end that is not accounted in
	 * rc_size.
	 */
	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
		    rc->rc_size + (1 << tvd->vdev_ashift));
	} else {
		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
	}
#endif
}
1925 * Start an IO operation on a RAIDZ VDev
1927 * Outline:
1928 * - For write operations:
1929 * 1. Generate the parity data
1930 * 2. Create child zio write operations to each column's vdev, for both
1931 * data and parity.
1932 * 3. If the column skips any sectors for padding, create optional dummy
1933 * write zio children for those areas to improve aggregation continuity.
1934 * - For read operations:
1935 * 1. Create child zio read operations to each data column's vdev to read
1936 * the range of data required for zio.
1937 * 2. If this is a scrub or resilver operation, or if any of the data
1938 * vdevs have had errors, then create zio read operations to the parity
1939 * columns' VDevs as well.
1941 static void
1942 vdev_raidz_io_start(zio_t *zio)
1944 vdev_t *vd = zio->io_vd;
1945 vdev_t *tvd = vd->vdev_top;
1946 vdev_t *cvd;
1947 raidz_map_t *rm;
1948 raidz_col_t *rc;
1949 int c, i;
1951 rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
1952 tvd->vdev_ashift, vd->vdev_children,
1953 vd->vdev_nparity);
1955 zio->io_vsd = rm;
1956 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1958 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1960 if (zio->io_type == ZIO_TYPE_WRITE) {
1961 vdev_raidz_generate_parity(rm);
1963 for (c = 0; c < rm->rm_cols; c++) {
1964 rc = &rm->rm_col[c];
1965 cvd = vd->vdev_child[rc->rc_devidx];
1967 /*
1968 * Verify physical to logical translation.
1969 */
1970 vdev_raidz_io_verify(zio, rm, c);
1972 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1973 rc->rc_offset, rc->rc_abd, rc->rc_size,
1974 zio->io_type, zio->io_priority, 0,
1975 vdev_raidz_child_done, rc));
1976 }
1978 /*
1979 * Generate optional I/Os for any skipped sectors to improve
1980 * aggregation contiguity.
1981 */
1982 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1983 ASSERT(c <= rm->rm_scols);
1984 if (c == rm->rm_scols)
1985 c = 0;
1986 rc = &rm->rm_col[c];
1987 cvd = vd->vdev_child[rc->rc_devidx];
1988 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1989 rc->rc_offset + rc->rc_size, NULL,
1990 1 << tvd->vdev_ashift,
1991 zio->io_type, zio->io_priority,
1992 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1993 }
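/*
 * A gloss on the optional zios above (illustrative): because they
 * carry ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, these children exist
 * only so the vdev queue's aggregation logic can bridge the
 * one-sector skip holes and merge the real column writes on either
 * side into one larger device I/O; absent such an opportunity they
 * are dropped rather than issued.
 */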
1995 zio_execute(zio);
1996 return;
1997 }
1999 ASSERT(zio->io_type == ZIO_TYPE_READ);
2001 /*
2002 * Iterate over the columns in reverse order so that we hit the parity
2003 * last -- any errors along the way will force us to read the parity.
2004 */
2005 for (c = rm->rm_cols - 1; c >= 0; c--) {
2006 rc = &rm->rm_col[c];
2007 cvd = vd->vdev_child[rc->rc_devidx];
2008 if (!vdev_readable(cvd)) {
2009 if (c >= rm->rm_firstdatacol)
2010 rm->rm_missingdata++;
2011 else
2012 rm->rm_missingparity++;
2013 rc->rc_error = SET_ERROR(ENXIO);
2014 rc->rc_tried = 1; /* don't even try */
2015 rc->rc_skipped = 1;
2016 continue;
2017 }
2018 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2019 if (c >= rm->rm_firstdatacol)
2020 rm->rm_missingdata++;
2021 else
2022 rm->rm_missingparity++;
2023 rc->rc_error = SET_ERROR(ESTALE);
2024 rc->rc_skipped = 1;
2025 continue;
2026 }
2027 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
2028 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
2029 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2030 rc->rc_offset, rc->rc_abd, rc->rc_size,
2031 zio->io_type, zio->io_priority, 0,
2032 vdev_raidz_child_done, rc));
2033 }
2034 }
2036 zio_execute(zio);
2037 }
2040 /*
2041 * Report a checksum error for a child of a RAID-Z device.
2042 */
2043 static void
2044 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
2045 {
2046 void *buf;
2047 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2049 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2050 zio_bad_cksum_t zbc;
2051 raidz_map_t *rm = zio->io_vsd;
2053 mutex_enter(&vd->vdev_stat_lock);
2054 vd->vdev_stat.vs_checksum_errors++;
2055 mutex_exit(&vd->vdev_stat_lock);
2057 zbc.zbc_has_cksum = 0;
2058 zbc.zbc_injected = rm->rm_ecksuminjected;
2060 buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
2061 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
2062 rc->rc_offset, rc->rc_size, buf, bad_data,
2063 &zbc);
2064 abd_return_buf(rc->rc_abd, buf, rc->rc_size);
2065 }
2066 }
2068 /*
2069 * We keep track of whether or not there were any injected errors, so that
2070 * any ereports we generate can note it.
2071 */
2072 static int
2073 raidz_checksum_verify(zio_t *zio)
2074 {
2075 zio_bad_cksum_t zbc;
2076 raidz_map_t *rm = zio->io_vsd;
2078 int ret = zio_checksum_error(zio, &zbc);
2079 if (ret != 0 && zbc.zbc_injected != 0)
2080 rm->rm_ecksuminjected = 1;
2082 return (ret);
2083 }
2085 /*
2086 * Generate the parity from the data columns. If we tried and were able to
2087 * read the parity without error, verify that the generated parity matches the
2088 * data we read. If it doesn't, we fire off a checksum error. Return the
2089 * number of such failures.
2090 */
2091 static int
2092 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
2093 {
2094 void *orig[VDEV_RAIDZ_MAXPARITY];
2095 int c, ret = 0;
2096 raidz_col_t *rc;
2098 blkptr_t *bp = zio->io_bp;
2099 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2100 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2102 if (checksum == ZIO_CHECKSUM_NOPARITY)
2103 return (ret);
2105 for (c = 0; c < rm->rm_firstdatacol; c++) {
2106 rc = &rm->rm_col[c];
2107 if (!rc->rc_tried || rc->rc_error != 0)
2108 continue;
2109 orig[c] = zio_buf_alloc(rc->rc_size);
2110 abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
2111 }
2113 vdev_raidz_generate_parity(rm);
2115 for (c = 0; c < rm->rm_firstdatacol; c++) {
2116 rc = &rm->rm_col[c];
2117 if (!rc->rc_tried || rc->rc_error != 0)
2118 continue;
2119 if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
2120 raidz_checksum_error(zio, rc, orig[c]);
2121 rc->rc_error = SET_ERROR(ECKSUM);
2122 ret++;
2123 }
2124 zio_buf_free(orig[c], rc->rc_size);
2125 }
2127 return (ret);
2128 }
2130 /*
2131 * Keep statistics on all the ways that we used parity to correct data.
2132 */
2133 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
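/*
 * Gloss (illustrative): the reconstruction code returned by
 * vdev_raidz_reconstruct() is a bitmask of the parity columns used,
 * so each slot counts one combination: raidz_corrected[1] tallies
 * recoveries that used P alone, raidz_corrected[3] those that used
 * P and Q together, and so on.
 */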
2135 static int
2136 vdev_raidz_worst_error(raidz_map_t *rm)
2137 {
2138 int error = 0;
2140 for (int c = 0; c < rm->rm_cols; c++)
2141 error = zio_worst_error(error, rm->rm_col[c].rc_error);
2143 return (error);
2144 }
2146 /*
2147 * Iterate over all combinations of bad data and attempt a reconstruction.
2148 * Note that the algorithm below is non-optimal because it doesn't take into
2149 * account how reconstruction is actually performed. For example, with
2150 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
2151 * is targeted as invalid as if columns 1 and 4 are targeted since in both
2152 * cases we'd only use parity information in column 0.
2153 */
2154 static int
2155 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
2156 {
2157 raidz_map_t *rm = zio->io_vsd;
2158 raidz_col_t *rc;
2159 void *orig[VDEV_RAIDZ_MAXPARITY];
2160 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
2161 int *tgts = &tstore[1];
2162 int current, next, i, c, n;
2163 int code, ret = 0;
2165 ASSERT(total_errors < rm->rm_firstdatacol);
2167 /*
2168 * This simplifies one edge condition.
2169 */
2170 tgts[-1] = -1;
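/*
 * (Gloss, illustrative:) tgts points at tstore[1], so tgts[-1] is
 * real storage (tstore[0]). The -1 sentinel lets the "scan forward
 * from tgts[current - 1] + 1" step below start at column 0 when
 * current == 0 without a special case.
 */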
2172 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
2173 /*
2174 * Initialize the targets array by finding the first n columns
2175 * that contain no error.
2176 *
2177 * If there were no data errors, we need to ensure that we're
2178 * always explicitly attempting to reconstruct at least one
2179 * data column. To do this, we simply push the highest target
2180 * up into the data columns.
2181 */
2182 for (c = 0, i = 0; i < n; i++) {
2183 if (i == n - 1 && data_errors == 0 &&
2184 c < rm->rm_firstdatacol) {
2185 c = rm->rm_firstdatacol;
2186 }
2188 while (rm->rm_col[c].rc_error != 0) {
2189 c++;
2190 ASSERT3S(c, <, rm->rm_cols);
2191 }
2193 tgts[i] = c++;
2194 }
2196 /*
2197 * Setting tgts[n] simplifies the other edge condition.
2198 */
2199 tgts[n] = rm->rm_cols;
2201 /*
2202 * These buffers were allocated in previous iterations.
2203 */
2204 for (i = 0; i < n - 1; i++) {
2205 ASSERT(orig[i] != NULL);
2206 }
2208 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
2210 current = 0;
2211 next = tgts[current];
2213 while (current != n) {
2214 tgts[current] = next;
2215 current = 0;
2217 /*
2218 * Save off the original data that we're going to
2219 * attempt to reconstruct.
2220 */
2221 for (i = 0; i < n; i++) {
2222 ASSERT(orig[i] != NULL);
2223 c = tgts[i];
2224 ASSERT3S(c, >=, 0);
2225 ASSERT3S(c, <, rm->rm_cols);
2226 rc = &rm->rm_col[c];
2227 abd_copy_to_buf(orig[i], rc->rc_abd,
2228 rc->rc_size);
2229 }
2231 /*
2232 * Attempt a reconstruction and exit the outer loop on
2233 * success.
2234 */
2235 code = vdev_raidz_reconstruct(rm, tgts, n);
2236 if (raidz_checksum_verify(zio) == 0) {
2237 atomic_inc_64(&raidz_corrected[code]);
2239 for (i = 0; i < n; i++) {
2240 c = tgts[i];
2241 rc = &rm->rm_col[c];
2242 ASSERT(rc->rc_error == 0);
2243 if (rc->rc_tried)
2244 raidz_checksum_error(zio, rc,
2245 orig[i]);
2246 rc->rc_error = SET_ERROR(ECKSUM);
2247 }
2249 ret = code;
2250 goto done;
2251 }
2253 /*
2254 * Restore the original data.
2255 */
2256 for (i = 0; i < n; i++) {
2257 c = tgts[i];
2258 rc = &rm->rm_col[c];
2259 abd_copy_from_buf(rc->rc_abd, orig[i],
2260 rc->rc_size);
2261 }
2263 do {
2264 /*
2265 * Find the next valid column after the current
2266 * position.
2267 */
2268 for (next = tgts[current] + 1;
2269 next < rm->rm_cols &&
2270 rm->rm_col[next].rc_error != 0; next++)
2271 continue;
2273 ASSERT(next <= tgts[current + 1]);
2275 /*
2276 * If that spot is available, we're done here.
2277 */
2278 if (next != tgts[current + 1])
2279 break;
2281 /*
2282 * Otherwise, find the next valid column after
2283 * the previous position.
2284 */
2285 for (c = tgts[current - 1] + 1;
2286 rm->rm_col[c].rc_error != 0; c++)
2287 continue;
2289 tgts[current] = c;
2290 current++;
2292 } while (current != n);
2293 }
2294 }
2295 n--;
2296 done:
2297 for (i = 0; i < n; i++) {
2298 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2299 }
2301 return (ret);
2302 }
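/*
 * Illustrative trace (with five usable columns, n = 2, and ignoring
 * the initial push of the last target into the data columns): the
 * odometer logic above visits every pair in colexicographic order --
 *
 *   (0,1) (0,2) (1,2) (0,3) (1,3) (2,3) (0,4) (1,4) (2,4) (3,4)
 *
 * restoring the saved data between attempts until some combination
 * reconstructs to data with a valid checksum.
 */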
2304 /*
2305 * Complete an IO operation on a RAIDZ VDev
2306 *
2307 * Outline:
2308 * - For write operations:
2309 * 1. Check for errors on the child IOs.
2310 * 2. Return, setting an error code if too few child VDevs were written
2311 * to reconstruct the data later. Note that partial writes are
2312 * considered successful if they can be reconstructed at all.
2313 * - For read operations:
2314 * 1. Check for errors on the child IOs.
2315 * 2. If data errors occurred:
2316 * a. Try to reassemble the data from the parity available.
2317 * b. If we haven't yet read the parity drives, read them now.
2318 * c. If all parity drives have been read but the data still doesn't
2319 * reassemble with a correct checksum, then try combinatorial
2320 * reconstruction.
2321 * d. If that doesn't work, return an error.
2322 * 3. If there were unexpected errors or this is a resilver operation,
2323 * rewrite the vdevs that had errors.
2324 */
2325 static void
2326 vdev_raidz_io_done(zio_t *zio)
2327 {
2328 vdev_t *vd = zio->io_vd;
2329 vdev_t *cvd;
2330 raidz_map_t *rm = zio->io_vsd;
2331 raidz_col_t *rc;
2332 int unexpected_errors = 0;
2333 int parity_errors = 0;
2334 int parity_untried = 0;
2335 int data_errors = 0;
2336 int total_errors = 0;
2337 int n, c;
2338 int tgts[VDEV_RAIDZ_MAXPARITY];
2339 int code;
2341 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
2343 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2344 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2346 for (c = 0; c < rm->rm_cols; c++) {
2347 rc = &rm->rm_col[c];
2349 if (rc->rc_error) {
2350 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
2352 if (c < rm->rm_firstdatacol)
2353 parity_errors++;
2354 else
2355 data_errors++;
2357 if (!rc->rc_skipped)
2358 unexpected_errors++;
2360 total_errors++;
2361 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2362 parity_untried++;
2363 }
2364 }
2366 if (zio->io_type == ZIO_TYPE_WRITE) {
2367 /*
2368 * XXX -- for now, treat partial writes as a success.
2369 * (If we couldn't write enough columns to reconstruct
2370 * the data, the I/O failed. Otherwise, good enough.)
2371 *
2372 * Now that we support write reallocation, it would be better
2373 * to treat partial failure as real failure unless there are
2374 * no non-degraded top-level vdevs left, and not update DTLs
2375 * if we intend to reallocate.
2376 */
2377 /* XXPOLICY */
2378 if (total_errors > rm->rm_firstdatacol)
2379 zio->io_error = vdev_raidz_worst_error(rm);
2381 return;
2382 }
2384 ASSERT(zio->io_type == ZIO_TYPE_READ);
2385 /*
2386 * There are three potential phases for a read:
2387 * 1. produce valid data from the columns read
2388 * 2. read all disks and try again
2389 * 3. perform combinatorial reconstruction
2390 *
2391 * Each phase is progressively both more expensive and less likely to
2392 * occur. If we encounter more errors than we can repair or all phases
2393 * fail, we have no choice but to return an error.
2394 */
2396 /*
2397 * If the number of errors we saw was correctable -- less than or equal
2398 * to the number of parity disks read -- attempt to produce data that
2399 * has a valid checksum. Naturally, this case applies in the absence of
2400 * any errors.
2401 */
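/*
 * Concrete case (illustrative): on a raidz2 (rm_firstdatacol == 2)
 * where one data column returned EIO and neither parity column was
 * read, total_errors == 1 but rm_firstdatacol - parity_untried == 0,
 * so the branch below is skipped and we fall through to read the
 * parity columns before reconstructing.
 */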
2402 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2403 if (data_errors == 0) {
2404 if (raidz_checksum_verify(zio) == 0) {
2405 /*
2406 * If we read parity information (unnecessarily
2407 * as it happens, since no reconstruction was
2408 * needed), regenerate and verify the parity.
2409 * We also regenerate parity when resilvering
2410 * so we can write it out to the failed device
2411 * later.
2412 */
2413 if (parity_errors + parity_untried <
2414 rm->rm_firstdatacol ||
2415 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2416 n = raidz_parity_verify(zio, rm);
2417 unexpected_errors += n;
2418 ASSERT(parity_errors + n <=
2419 rm->rm_firstdatacol);
2420 }
2421 goto done;
2422 }
2423 } else {
2424 /*
2425 * We either attempt to read all the parity columns or
2426 * none of them. If we didn't try to read parity, we
2427 * wouldn't be here in the correctable case. There must
2428 * also have been fewer parity errors than parity
2429 * columns or, again, we wouldn't be in this code path.
2430 */
2431 ASSERT(parity_untried == 0);
2432 ASSERT(parity_errors < rm->rm_firstdatacol);
2434 /*
2435 * Identify the data columns that reported an error.
2436 */
2437 n = 0;
2438 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2439 rc = &rm->rm_col[c];
2440 if (rc->rc_error != 0) {
2441 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2442 tgts[n++] = c;
2443 }
2444 }
2446 ASSERT(rm->rm_firstdatacol >= n);
2448 code = vdev_raidz_reconstruct(rm, tgts, n);
2450 if (raidz_checksum_verify(zio) == 0) {
2451 atomic_inc_64(&raidz_corrected[code]);
2453 /*
2454 * If we read more parity disks than were used
2455 * for reconstruction, confirm that the other
2456 * parity disks produced correct data. This
2457 * routine is suboptimal in that it regenerates
2458 * the parity that we already used in addition
2459 * to the parity that we're attempting to
2460 * verify, but this should be a relatively
2461 * uncommon case, and can be optimized if it
2462 * becomes a problem. Note that we regenerate
2463 * parity when resilvering so we can write it
2464 * out to failed devices later.
2465 */
2466 if (parity_errors < rm->rm_firstdatacol - n ||
2467 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2468 n = raidz_parity_verify(zio, rm);
2469 unexpected_errors += n;
2470 ASSERT(parity_errors + n <=
2471 rm->rm_firstdatacol);
2472 }
2474 goto done;
2475 }
2476 }
2477 }
2479 /*
2480 * This isn't a typical situation -- either we got a read error or
2481 * a child silently returned bad data. Read every block so we can
2482 * try again with as much data and parity as we can track down. If
2483 * we've already been through once before, all children will be marked
2484 * as tried so we'll proceed to combinatorial reconstruction.
2485 */
2486 unexpected_errors = 1;
2487 rm->rm_missingdata = 0;
2488 rm->rm_missingparity = 0;
2490 for (c = 0; c < rm->rm_cols; c++) {
2491 if (rm->rm_col[c].rc_tried)
2492 continue;
2494 zio_vdev_io_redone(zio);
2495 do {
2496 rc = &rm->rm_col[c];
2497 if (rc->rc_tried)
2498 continue;
2499 zio_nowait(zio_vdev_child_io(zio, NULL,
2500 vd->vdev_child[rc->rc_devidx],
2501 rc->rc_offset, rc->rc_abd, rc->rc_size,
2502 zio->io_type, zio->io_priority, 0,
2503 vdev_raidz_child_done, rc));
2504 } while (++c < rm->rm_cols);
2506 return;
2507 }
2509 /*
2510 * At this point we've attempted to reconstruct the data given the
2511 * errors we detected, and we've attempted to read all columns. There
2512 * must, therefore, be one or more additional problems -- silent errors
2513 * resulting in invalid data rather than explicit I/O errors resulting
2514 * in absent data. We check if there is enough additional data to
2515 * possibly reconstruct the data and then perform combinatorial
2516 * reconstruction over all possible combinations. If that fails,
2517 * we're cooked.
2518 */
2519 if (total_errors > rm->rm_firstdatacol) {
2520 zio->io_error = vdev_raidz_worst_error(rm);
2522 } else if (total_errors < rm->rm_firstdatacol &&
2523 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2524 /*
2525 * If we didn't use all the available parity for the
2526 * combinatorial reconstruction, verify that the remaining
2527 * parity is correct.
2528 */
2529 if (code != (1 << rm->rm_firstdatacol) - 1)
2530 (void) raidz_parity_verify(zio, rm);
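/*
 * (Gloss, illustrative:) the code is a bitmask of the parity
 * columns used, so (1 << rm->rm_firstdatacol) - 1 is the
 * "all parity" mask -- 0x7 for triple-parity. Any smaller value
 * means some parity column went unused by combrec and is still
 * unverified, hence the extra pass above.
 */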
2531 } else {
2532 /*
2533 * We're here because either:
2534 *
2535 * total_errors == rm_firstdatacol, or
2536 * vdev_raidz_combrec() failed
2537 *
2538 * In either case, there is enough bad data to prevent
2539 * reconstruction.
2540 *
2541 * Start checksum ereports for all children which haven't
2542 * failed, provided the IO wasn't speculative.
2543 */
2544 zio->io_error = SET_ERROR(ECKSUM);
2546 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2547 for (c = 0; c < rm->rm_cols; c++) {
2548 rc = &rm->rm_col[c];
2549 if (rc->rc_error == 0) {
2550 zio_bad_cksum_t zbc;
2551 zbc.zbc_has_cksum = 0;
2552 zbc.zbc_injected =
2553 rm->rm_ecksuminjected;
2555 zfs_ereport_start_checksum(
2556 zio->io_spa,
2557 vd->vdev_child[rc->rc_devidx],
2558 zio, rc->rc_offset, rc->rc_size,
2559 (void *)(uintptr_t)c, &zbc);
2560 }
2561 }
2562 }
2563 }
2565 done:
2566 zio_checksum_verified(zio);
2568 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2569 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2570 /*
2571 * Use the good data we have in hand to repair damaged children.
2572 */
2573 for (c = 0; c < rm->rm_cols; c++) {
2574 rc = &rm->rm_col[c];
2575 cvd = vd->vdev_child[rc->rc_devidx];
2577 if (rc->rc_error == 0)
2578 continue;
2580 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2581 rc->rc_offset, rc->rc_abd, rc->rc_size,
2582 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2583 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2584 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2585 }
2586 }
2587 }
2589 static void
2590 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2591 {
2592 if (faulted > vd->vdev_nparity)
2593 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2594 VDEV_AUX_NO_REPLICAS);
2595 else if (degraded + faulted != 0)
2596 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2597 else
2598 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2599 }
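/*
 * Example (illustrative): a raidz2 (vdev_nparity == 2) with two
 * faulted children is merely DEGRADED -- every block is still
 * reconstructable -- and only a third fault makes it CANT_OPEN.
 */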
2601 static void
2602 vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res)
2603 {
2604 vdev_t *raidvd = cvd->vdev_parent;
2605 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
2607 uint64_t width = raidvd->vdev_children;
2608 uint64_t tgt_col = cvd->vdev_id;
2609 uint64_t ashift = raidvd->vdev_top->vdev_ashift;
2611 /* make sure the offsets are block-aligned */
2612 ASSERT0(in->rs_start % (1 << ashift));
2613 ASSERT0(in->rs_end % (1 << ashift));
2614 uint64_t b_start = in->rs_start >> ashift;
2615 uint64_t b_end = in->rs_end >> ashift;
2617 uint64_t start_row = 0;
2618 if (b_start > tgt_col) /* avoid underflow */
2619 start_row = ((b_start - tgt_col - 1) / width) + 1;
2621 uint64_t end_row = 0;
2622 if (b_end > tgt_col)
2623 end_row = ((b_end - tgt_col - 1) / width) + 1;
2625 res->rs_start = start_row << ashift;
2626 res->rs_end = end_row << ashift;
2628 ASSERT3U(res->rs_start, <=, in->rs_start);
2629 ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start);
2630 }
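/*
 * Worked example (illustrative): width = 4, tgt_col = 1, ashift = 9,
 * in->rs_start = 6 << 9. Under the layout this math assumes (parent
 * block row * width + tgt_col lives on this child at that row), the
 * child holds parent blocks 1 (row 0) and 5 (row 1) below
 * b_start = 6, so start_row = ((6 - 1 - 1) / 4) + 1 = 2 and the
 * translated rs_start is 2 << 9.
 */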
2632 vdev_ops_t vdev_raidz_ops = {
2633 vdev_raidz_open,
2634 vdev_raidz_close,
2635 vdev_raidz_asize,
2636 vdev_raidz_io_start,
2637 vdev_raidz_io_done,
2638 vdev_raidz_state_change,
2639 NULL,
2640 NULL,
2641 NULL,
2642 vdev_raidz_xlate,
2643 VDEV_TYPE_RAIDZ, /* name of this vdev type */
2644 B_FALSE /* not a leaf vdev */
2645 };