4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
25 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26 * Copyright (c) 2014 Integros [integros.com]
29 #include <sys/zfs_context.h>
31 #include <sys/vdev_impl.h>
32 #include <sys/vdev_disk.h>
33 #include <sys/vdev_file.h>
34 #include <sys/vdev_raidz.h>
36 #include <sys/zio_checksum.h>
38 #include <sys/fs/zfs.h>
39 #include <sys/fm/fs/zfs.h>
42 * Virtual device vector for RAID-Z.
44 * This vdev supports single, double, and triple parity. For single parity,
45 * we use a simple XOR of all the data columns. For double or triple parity,
46 * we use a special case of Reed-Solomon coding. This extends the
47 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
48 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
49 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
50 * former is also based. The latter is designed to provide higher performance
53 * Note that the Plank paper claimed to support arbitrary N+M, but was then
54 * amended six years later identifying a critical flaw that invalidates its
55 * claims. Nevertheless, the technique can be adapted to work for up to
56 * triple parity. For additional parity, the amendment "Note: Correction to
57 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
58 * is viable, but the additional complexity means that write performance will
61 * All of the methods above operate on a Galois field, defined over the
62 * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
63 * can be expressed with a single byte. Briefly, the operations on the
64 * field are defined as follows:
66 * o addition (+) is represented by a bitwise XOR
67 * o subtraction (-) is therefore identical to addition: A + B = A - B
68 * o multiplication of A by 2 is defined by the following bitwise expression:
73 * (A * 2)_4 = A_3 + A_7
74 * (A * 2)_3 = A_2 + A_7
75 * (A * 2)_2 = A_1 + A_7
79 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
80 * As an aside, this multiplication is derived from the error correcting
81 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
83 * Observe that any number in the field (except for 0) can be expressed as a
84 * power of 2 -- a generator for the field. We store a table of the powers of
85 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
86 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
87 * than field addition). The inverse of a field element A (A^-1) is therefore
88 * A ^ (255 - 1) = A^254.
90 * The up-to-three parity columns, P, Q, R over several data columns,
91 * D_0, ... D_n-1, can be expressed by field operations:
93 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
94 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
95 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
96 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
97 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
99 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
100 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
101 * independent coefficients. (There are no additional coefficients that have
102 * this property which is why the uncorrected Plank method breaks down.)
104 * See the reconstruction code below for how P, Q and R can used individually
105 * or in concert to recover missing data columns.
108 typedef struct raidz_col
{
109 uint64_t rc_devidx
; /* child device index for I/O */
110 uint64_t rc_offset
; /* device offset */
111 uint64_t rc_size
; /* I/O size */
112 abd_t
*rc_abd
; /* I/O data */
113 void *rc_gdata
; /* used to store the "good" version */
114 int rc_error
; /* I/O error for this device */
115 uint8_t rc_tried
; /* Did we attempt this I/O column? */
116 uint8_t rc_skipped
; /* Did we skip this I/O column? */
119 typedef struct raidz_map
{
120 uint64_t rm_cols
; /* Regular column count */
121 uint64_t rm_scols
; /* Count including skipped columns */
122 uint64_t rm_bigcols
; /* Number of oversized columns */
123 uint64_t rm_asize
; /* Actual total I/O size */
124 uint64_t rm_missingdata
; /* Count of missing data devices */
125 uint64_t rm_missingparity
; /* Count of missing parity devices */
126 uint64_t rm_firstdatacol
; /* First data column/parity count */
127 uint64_t rm_nskip
; /* Skipped sectors for padding */
128 uint64_t rm_skipstart
; /* Column index of padding start */
129 abd_t
*rm_abd_copy
; /* rm_asize-buffer of copied data */
130 uintptr_t rm_reports
; /* # of referencing checksum reports */
131 uint8_t rm_freed
; /* map no longer has referencing ZIO */
132 uint8_t rm_ecksuminjected
; /* checksum error was injected */
133 raidz_col_t rm_col
[1]; /* Flexible array of I/O columns */
136 #define VDEV_RAIDZ_P 0
137 #define VDEV_RAIDZ_Q 1
138 #define VDEV_RAIDZ_R 2
140 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
141 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
144 * We provide a mechanism to perform the field multiplication operation on a
145 * 64-bit value all at once rather than a byte at a time. This works by
146 * creating a mask from the top bit in each byte and using that to
147 * conditionally apply the XOR of 0x1d.
149 #define VDEV_RAIDZ_64MUL_2(x, mask) \
151 (mask) = (x) & 0x8080808080808080ULL; \
152 (mask) = ((mask) << 1) - ((mask) >> 7); \
153 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
154 ((mask) & 0x1d1d1d1d1d1d1d1d); \
157 #define VDEV_RAIDZ_64MUL_4(x, mask) \
159 VDEV_RAIDZ_64MUL_2((x), mask); \
160 VDEV_RAIDZ_64MUL_2((x), mask); \
163 #define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE)
166 * Force reconstruction to use the general purpose method.
168 int vdev_raidz_default_to_general
;
170 /* Powers of 2 in the Galois field defined above. */
171 static const uint8_t vdev_raidz_pow2
[256] = {
172 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
173 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
174 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
175 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
176 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
177 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
178 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
179 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
180 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
181 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
182 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
183 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
184 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
185 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
186 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
187 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
188 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
189 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
190 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
191 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
192 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
193 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
194 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
195 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
196 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
197 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
198 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
199 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
200 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
201 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
202 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
203 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
205 /* Logs of 2 in the Galois field defined above. */
206 static const uint8_t vdev_raidz_log2
[256] = {
207 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
208 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
209 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
210 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
211 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
212 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
213 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
214 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
215 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
216 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
217 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
218 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
219 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
220 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
221 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
222 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
223 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
224 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
225 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
226 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
227 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
228 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
229 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
230 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
231 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
232 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
233 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
234 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
235 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
236 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
237 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
238 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
241 static void vdev_raidz_generate_parity(raidz_map_t
*rm
);
244 * Multiply a given number by 2 raised to the given power.
247 vdev_raidz_exp2(uint_t a
, int exp
)
253 ASSERT(vdev_raidz_log2
[a
] > 0 || a
== 1);
255 exp
+= vdev_raidz_log2
[a
];
259 return (vdev_raidz_pow2
[exp
]);
263 vdev_raidz_map_free(raidz_map_t
*rm
)
268 for (c
= 0; c
< rm
->rm_firstdatacol
; c
++) {
269 abd_free(rm
->rm_col
[c
].rc_abd
);
271 if (rm
->rm_col
[c
].rc_gdata
!= NULL
)
272 zio_buf_free(rm
->rm_col
[c
].rc_gdata
,
273 rm
->rm_col
[c
].rc_size
);
277 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
278 abd_put(rm
->rm_col
[c
].rc_abd
);
279 size
+= rm
->rm_col
[c
].rc_size
;
282 if (rm
->rm_abd_copy
!= NULL
)
283 abd_free(rm
->rm_abd_copy
);
285 kmem_free(rm
, offsetof(raidz_map_t
, rm_col
[rm
->rm_scols
]));
289 vdev_raidz_map_free_vsd(zio_t
*zio
)
291 raidz_map_t
*rm
= zio
->io_vsd
;
293 ASSERT0(rm
->rm_freed
);
296 if (rm
->rm_reports
== 0)
297 vdev_raidz_map_free(rm
);
302 vdev_raidz_cksum_free(void *arg
, size_t ignored
)
304 raidz_map_t
*rm
= arg
;
306 ASSERT3U(rm
->rm_reports
, >, 0);
308 if (--rm
->rm_reports
== 0 && rm
->rm_freed
!= 0)
309 vdev_raidz_map_free(rm
);
313 vdev_raidz_cksum_finish(zio_cksum_report_t
*zcr
, const void *good_data
)
315 raidz_map_t
*rm
= zcr
->zcr_cbdata
;
316 size_t c
= zcr
->zcr_cbinfo
;
319 const char *good
= NULL
;
322 if (good_data
== NULL
) {
323 zfs_ereport_finish_checksum(zcr
, NULL
, NULL
, B_FALSE
);
327 if (c
< rm
->rm_firstdatacol
) {
329 * The first time through, calculate the parity blocks for
330 * the good data (this relies on the fact that the good
331 * data never changes for a given logical ZIO)
333 if (rm
->rm_col
[0].rc_gdata
== NULL
) {
334 abd_t
*bad_parity
[VDEV_RAIDZ_MAXPARITY
];
339 * Set up the rm_col[]s to generate the parity for
340 * good_data, first saving the parity bufs and
341 * replacing them with buffers to hold the result.
343 for (x
= 0; x
< rm
->rm_firstdatacol
; x
++) {
344 bad_parity
[x
] = rm
->rm_col
[x
].rc_abd
;
345 rm
->rm_col
[x
].rc_gdata
=
346 zio_buf_alloc(rm
->rm_col
[x
].rc_size
);
347 rm
->rm_col
[x
].rc_abd
=
348 abd_get_from_buf(rm
->rm_col
[x
].rc_gdata
,
349 rm
->rm_col
[x
].rc_size
);
352 /* fill in the data columns from good_data */
353 buf
= (char *)good_data
;
354 for (; x
< rm
->rm_cols
; x
++) {
355 abd_put(rm
->rm_col
[x
].rc_abd
);
356 rm
->rm_col
[x
].rc_abd
= abd_get_from_buf(buf
,
357 rm
->rm_col
[x
].rc_size
);
358 buf
+= rm
->rm_col
[x
].rc_size
;
362 * Construct the parity from the good data.
364 vdev_raidz_generate_parity(rm
);
366 /* restore everything back to its original state */
367 for (x
= 0; x
< rm
->rm_firstdatacol
; x
++) {
368 abd_put(rm
->rm_col
[x
].rc_abd
);
369 rm
->rm_col
[x
].rc_abd
= bad_parity
[x
];
373 for (x
= rm
->rm_firstdatacol
; x
< rm
->rm_cols
; x
++) {
374 abd_put(rm
->rm_col
[x
].rc_abd
);
375 rm
->rm_col
[x
].rc_abd
= abd_get_offset(
376 rm
->rm_abd_copy
, offset
);
377 offset
+= rm
->rm_col
[x
].rc_size
;
381 ASSERT3P(rm
->rm_col
[c
].rc_gdata
, !=, NULL
);
382 good
= rm
->rm_col
[c
].rc_gdata
;
384 /* adjust good_data to point at the start of our column */
387 for (x
= rm
->rm_firstdatacol
; x
< c
; x
++)
388 good
+= rm
->rm_col
[x
].rc_size
;
391 bad
= abd_borrow_buf_copy(rm
->rm_col
[c
].rc_abd
, rm
->rm_col
[c
].rc_size
);
392 /* we drop the ereport if it ends up that the data was good */
393 zfs_ereport_finish_checksum(zcr
, good
, bad
, B_TRUE
);
394 abd_return_buf(rm
->rm_col
[c
].rc_abd
, bad
, rm
->rm_col
[c
].rc_size
);
398 * Invoked indirectly by zfs_ereport_start_checksum(), called
399 * below when our read operation fails completely. The main point
400 * is to keep a copy of everything we read from disk, so that at
401 * vdev_raidz_cksum_finish() time we can compare it with the good data.
404 vdev_raidz_cksum_report(zio_t
*zio
, zio_cksum_report_t
*zcr
, void *arg
)
406 size_t c
= (size_t)(uintptr_t)arg
;
409 raidz_map_t
*rm
= zio
->io_vsd
;
412 /* set up the report and bump the refcount */
413 zcr
->zcr_cbdata
= rm
;
415 zcr
->zcr_finish
= vdev_raidz_cksum_finish
;
416 zcr
->zcr_free
= vdev_raidz_cksum_free
;
419 ASSERT3U(rm
->rm_reports
, >, 0);
421 if (rm
->rm_abd_copy
!= NULL
)
425 * It's the first time we're called for this raidz_map_t, so we need
426 * to copy the data aside; there's no guarantee that our zio's buffer
427 * won't be re-used for something else.
429 * Our parity data is already in separate buffers, so there's no need
434 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++)
435 size
+= rm
->rm_col
[c
].rc_size
;
438 abd_alloc_sametype(rm
->rm_col
[rm
->rm_firstdatacol
].rc_abd
, size
);
440 for (offset
= 0, c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
441 raidz_col_t
*col
= &rm
->rm_col
[c
];
442 abd_t
*tmp
= abd_get_offset(rm
->rm_abd_copy
, offset
);
444 abd_copy(tmp
, col
->rc_abd
, col
->rc_size
);
445 abd_put(col
->rc_abd
);
448 offset
+= col
->rc_size
;
450 ASSERT3U(offset
, ==, size
);
453 static const zio_vsd_ops_t vdev_raidz_vsd_ops
= {
454 vdev_raidz_map_free_vsd
,
455 vdev_raidz_cksum_report
459 * Divides the IO evenly across all child vdevs; usually, dcols is
460 * the number of children in the target vdev.
463 vdev_raidz_map_alloc(abd_t
*abd
, uint64_t size
, uint64_t offset
,
464 uint64_t unit_shift
, uint64_t dcols
, uint64_t nparity
)
467 /* The starting RAIDZ (parent) vdev sector of the block. */
468 uint64_t b
= offset
>> unit_shift
;
469 /* The zio's size in units of the vdev's minimum sector size. */
470 uint64_t s
= size
>> unit_shift
;
471 /* The first column for this stripe. */
472 uint64_t f
= b
% dcols
;
473 /* The starting byte offset on each child vdev. */
474 uint64_t o
= (b
/ dcols
) << unit_shift
;
475 uint64_t q
, r
, c
, bc
, col
, acols
, scols
, coff
, devidx
, asize
, tot
;
479 * "Quotient": The number of data sectors for this stripe on all but
480 * the "big column" child vdevs that also contain "remainder" data.
482 q
= s
/ (dcols
- nparity
);
485 * "Remainder": The number of partial stripe data sectors in this I/O.
486 * This will add a sector to some, but not all, child vdevs.
488 r
= s
- q
* (dcols
- nparity
);
490 /* The number of "big columns" - those which contain remainder data. */
491 bc
= (r
== 0 ? 0 : r
+ nparity
);
494 * The total number of data and parity sectors associated with
497 tot
= s
+ nparity
* (q
+ (r
== 0 ? 0 : 1));
499 /* acols: The columns that will be accessed. */
500 /* scols: The columns that will be accessed or skipped. */
502 /* Our I/O request doesn't span all child vdevs. */
504 scols
= MIN(dcols
, roundup(bc
, nparity
+ 1));
510 ASSERT3U(acols
, <=, scols
);
512 rm
= kmem_alloc(offsetof(raidz_map_t
, rm_col
[scols
]), KM_SLEEP
);
515 rm
->rm_scols
= scols
;
517 rm
->rm_skipstart
= bc
;
518 rm
->rm_missingdata
= 0;
519 rm
->rm_missingparity
= 0;
520 rm
->rm_firstdatacol
= nparity
;
521 rm
->rm_abd_copy
= NULL
;
524 rm
->rm_ecksuminjected
= 0;
528 for (c
= 0; c
< scols
; c
++) {
533 coff
+= 1ULL << unit_shift
;
535 rm
->rm_col
[c
].rc_devidx
= col
;
536 rm
->rm_col
[c
].rc_offset
= coff
;
537 rm
->rm_col
[c
].rc_abd
= NULL
;
538 rm
->rm_col
[c
].rc_gdata
= NULL
;
539 rm
->rm_col
[c
].rc_error
= 0;
540 rm
->rm_col
[c
].rc_tried
= 0;
541 rm
->rm_col
[c
].rc_skipped
= 0;
544 rm
->rm_col
[c
].rc_size
= 0;
546 rm
->rm_col
[c
].rc_size
= (q
+ 1) << unit_shift
;
548 rm
->rm_col
[c
].rc_size
= q
<< unit_shift
;
550 asize
+= rm
->rm_col
[c
].rc_size
;
553 ASSERT3U(asize
, ==, tot
<< unit_shift
);
554 rm
->rm_asize
= roundup(asize
, (nparity
+ 1) << unit_shift
);
555 rm
->rm_nskip
= roundup(tot
, nparity
+ 1) - tot
;
556 ASSERT3U(rm
->rm_asize
- asize
, ==, rm
->rm_nskip
<< unit_shift
);
557 ASSERT3U(rm
->rm_nskip
, <=, nparity
);
559 for (c
= 0; c
< rm
->rm_firstdatacol
; c
++)
560 rm
->rm_col
[c
].rc_abd
=
561 abd_alloc_linear(rm
->rm_col
[c
].rc_size
, B_TRUE
);
563 rm
->rm_col
[c
].rc_abd
= abd_get_offset(abd
, 0);
564 off
= rm
->rm_col
[c
].rc_size
;
566 for (c
= c
+ 1; c
< acols
; c
++) {
567 rm
->rm_col
[c
].rc_abd
= abd_get_offset(abd
, off
);
568 off
+= rm
->rm_col
[c
].rc_size
;
572 * If all data stored spans all columns, there's a danger that parity
573 * will always be on the same device and, since parity isn't read
574 * during normal operation, that that device's I/O bandwidth won't be
575 * used effectively. We therefore switch the parity every 1MB.
577 * ... at least that was, ostensibly, the theory. As a practical
578 * matter unless we juggle the parity between all devices evenly, we
579 * won't see any benefit. Further, occasional writes that aren't a
580 * multiple of the LCM of the number of children and the minimum
581 * stripe width are sufficient to avoid pessimal behavior.
582 * Unfortunately, this decision created an implicit on-disk format
583 * requirement that we need to support for all eternity, but only
584 * for single-parity RAID-Z.
586 * If we intend to skip a sector in the zeroth column for padding
587 * we must make sure to note this swap. We will never intend to
588 * skip the first column since at least one data and one parity
589 * column must appear in each row.
591 ASSERT(rm
->rm_cols
>= 2);
592 ASSERT(rm
->rm_col
[0].rc_size
== rm
->rm_col
[1].rc_size
);
594 if (rm
->rm_firstdatacol
== 1 && (offset
& (1ULL << 20))) {
595 devidx
= rm
->rm_col
[0].rc_devidx
;
596 o
= rm
->rm_col
[0].rc_offset
;
597 rm
->rm_col
[0].rc_devidx
= rm
->rm_col
[1].rc_devidx
;
598 rm
->rm_col
[0].rc_offset
= rm
->rm_col
[1].rc_offset
;
599 rm
->rm_col
[1].rc_devidx
= devidx
;
600 rm
->rm_col
[1].rc_offset
= o
;
602 if (rm
->rm_skipstart
== 0)
603 rm
->rm_skipstart
= 1;
616 vdev_raidz_p_func(void *buf
, size_t size
, void *private)
618 struct pqr_struct
*pqr
= private;
619 const uint64_t *src
= buf
;
620 int i
, cnt
= size
/ sizeof (src
[0]);
622 ASSERT(pqr
->p
&& !pqr
->q
&& !pqr
->r
);
624 for (i
= 0; i
< cnt
; i
++, src
++, pqr
->p
++)
631 vdev_raidz_pq_func(void *buf
, size_t size
, void *private)
633 struct pqr_struct
*pqr
= private;
634 const uint64_t *src
= buf
;
636 int i
, cnt
= size
/ sizeof (src
[0]);
638 ASSERT(pqr
->p
&& pqr
->q
&& !pqr
->r
);
640 for (i
= 0; i
< cnt
; i
++, src
++, pqr
->p
++, pqr
->q
++) {
642 VDEV_RAIDZ_64MUL_2(*pqr
->q
, mask
);
650 vdev_raidz_pqr_func(void *buf
, size_t size
, void *private)
652 struct pqr_struct
*pqr
= private;
653 const uint64_t *src
= buf
;
655 int i
, cnt
= size
/ sizeof (src
[0]);
657 ASSERT(pqr
->p
&& pqr
->q
&& pqr
->r
);
659 for (i
= 0; i
< cnt
; i
++, src
++, pqr
->p
++, pqr
->q
++, pqr
->r
++) {
661 VDEV_RAIDZ_64MUL_2(*pqr
->q
, mask
);
663 VDEV_RAIDZ_64MUL_4(*pqr
->r
, mask
);
671 vdev_raidz_generate_parity_p(raidz_map_t
*rm
)
677 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
678 src
= rm
->rm_col
[c
].rc_abd
;
679 p
= abd_to_buf(rm
->rm_col
[VDEV_RAIDZ_P
].rc_abd
);
681 if (c
== rm
->rm_firstdatacol
) {
682 abd_copy_to_buf(p
, src
, rm
->rm_col
[c
].rc_size
);
684 struct pqr_struct pqr
= { p
, NULL
, NULL
};
685 (void) abd_iterate_func(src
, 0, rm
->rm_col
[c
].rc_size
,
686 vdev_raidz_p_func
, &pqr
);
692 vdev_raidz_generate_parity_pq(raidz_map_t
*rm
)
694 uint64_t *p
, *q
, pcnt
, ccnt
, mask
, i
;
698 pcnt
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
/ sizeof (p
[0]);
699 ASSERT(rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
==
700 rm
->rm_col
[VDEV_RAIDZ_Q
].rc_size
);
702 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
703 src
= rm
->rm_col
[c
].rc_abd
;
704 p
= abd_to_buf(rm
->rm_col
[VDEV_RAIDZ_P
].rc_abd
);
705 q
= abd_to_buf(rm
->rm_col
[VDEV_RAIDZ_Q
].rc_abd
);
707 ccnt
= rm
->rm_col
[c
].rc_size
/ sizeof (p
[0]);
709 if (c
== rm
->rm_firstdatacol
) {
710 abd_copy_to_buf(p
, src
, rm
->rm_col
[c
].rc_size
);
711 (void) memcpy(q
, p
, rm
->rm_col
[c
].rc_size
);
713 struct pqr_struct pqr
= { p
, q
, NULL
};
714 (void) abd_iterate_func(src
, 0, rm
->rm_col
[c
].rc_size
,
715 vdev_raidz_pq_func
, &pqr
);
718 if (c
== rm
->rm_firstdatacol
) {
719 for (i
= ccnt
; i
< pcnt
; i
++) {
725 * Treat short columns as though they are full of 0s.
726 * Note that there's therefore nothing needed for P.
728 for (i
= ccnt
; i
< pcnt
; i
++) {
729 VDEV_RAIDZ_64MUL_2(q
[i
], mask
);
736 vdev_raidz_generate_parity_pqr(raidz_map_t
*rm
)
738 uint64_t *p
, *q
, *r
, pcnt
, ccnt
, mask
, i
;
742 pcnt
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
/ sizeof (p
[0]);
743 ASSERT(rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
==
744 rm
->rm_col
[VDEV_RAIDZ_Q
].rc_size
);
745 ASSERT(rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
==
746 rm
->rm_col
[VDEV_RAIDZ_R
].rc_size
);
748 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
749 src
= rm
->rm_col
[c
].rc_abd
;
750 p
= abd_to_buf(rm
->rm_col
[VDEV_RAIDZ_P
].rc_abd
);
751 q
= abd_to_buf(rm
->rm_col
[VDEV_RAIDZ_Q
].rc_abd
);
752 r
= abd_to_buf(rm
->rm_col
[VDEV_RAIDZ_R
].rc_abd
);
754 ccnt
= rm
->rm_col
[c
].rc_size
/ sizeof (p
[0]);
756 if (c
== rm
->rm_firstdatacol
) {
757 abd_copy_to_buf(p
, src
, rm
->rm_col
[c
].rc_size
);
758 (void) memcpy(q
, p
, rm
->rm_col
[c
].rc_size
);
759 (void) memcpy(r
, p
, rm
->rm_col
[c
].rc_size
);
761 struct pqr_struct pqr
= { p
, q
, r
};
762 (void) abd_iterate_func(src
, 0, rm
->rm_col
[c
].rc_size
,
763 vdev_raidz_pqr_func
, &pqr
);
766 if (c
== rm
->rm_firstdatacol
) {
767 for (i
= ccnt
; i
< pcnt
; i
++) {
774 * Treat short columns as though they are full of 0s.
775 * Note that there's therefore nothing needed for P.
777 for (i
= ccnt
; i
< pcnt
; i
++) {
778 VDEV_RAIDZ_64MUL_2(q
[i
], mask
);
779 VDEV_RAIDZ_64MUL_4(r
[i
], mask
);
786 * Generate RAID parity in the first virtual columns according to the number of
787 * parity columns available.
790 vdev_raidz_generate_parity(raidz_map_t
*rm
)
792 switch (rm
->rm_firstdatacol
) {
794 vdev_raidz_generate_parity_p(rm
);
797 vdev_raidz_generate_parity_pq(rm
);
800 vdev_raidz_generate_parity_pqr(rm
);
803 cmn_err(CE_PANIC
, "invalid RAID-Z configuration");
809 vdev_raidz_reconst_p_func(void *dbuf
, void *sbuf
, size_t size
, void *private)
811 uint64_t *dst
= dbuf
;
812 uint64_t *src
= sbuf
;
813 int cnt
= size
/ sizeof (src
[0]);
815 for (int i
= 0; i
< cnt
; i
++) {
824 vdev_raidz_reconst_q_pre_func(void *dbuf
, void *sbuf
, size_t size
,
827 uint64_t *dst
= dbuf
;
828 uint64_t *src
= sbuf
;
830 int cnt
= size
/ sizeof (dst
[0]);
832 for (int i
= 0; i
< cnt
; i
++, dst
++, src
++) {
833 VDEV_RAIDZ_64MUL_2(*dst
, mask
);
842 vdev_raidz_reconst_q_pre_tail_func(void *buf
, size_t size
, void *private)
846 int cnt
= size
/ sizeof (dst
[0]);
848 for (int i
= 0; i
< cnt
; i
++, dst
++) {
849 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */
850 VDEV_RAIDZ_64MUL_2(*dst
, mask
);
856 struct reconst_q_struct
{
862 vdev_raidz_reconst_q_post_func(void *buf
, size_t size
, void *private)
864 struct reconst_q_struct
*rq
= private;
866 int cnt
= size
/ sizeof (dst
[0]);
868 for (int i
= 0; i
< cnt
; i
++, dst
++, rq
->q
++) {
873 for (j
= 0, b
= (uint8_t *)dst
; j
< 8; j
++, b
++) {
874 *b
= vdev_raidz_exp2(*b
, rq
->exp
);
881 struct reconst_pq_struct
{
891 vdev_raidz_reconst_pq_func(void *xbuf
, void *ybuf
, size_t size
, void *private)
893 struct reconst_pq_struct
*rpq
= private;
897 for (int i
= 0; i
< size
;
898 i
++, rpq
->p
++, rpq
->q
++, rpq
->pxy
++, rpq
->qxy
++, xd
++, yd
++) {
899 *xd
= vdev_raidz_exp2(*rpq
->p
^ *rpq
->pxy
, rpq
->aexp
) ^
900 vdev_raidz_exp2(*rpq
->q
^ *rpq
->qxy
, rpq
->bexp
);
901 *yd
= *rpq
->p
^ *rpq
->pxy
^ *xd
;
908 vdev_raidz_reconst_pq_tail_func(void *xbuf
, size_t size
, void *private)
910 struct reconst_pq_struct
*rpq
= private;
913 for (int i
= 0; i
< size
;
914 i
++, rpq
->p
++, rpq
->q
++, rpq
->pxy
++, rpq
->qxy
++, xd
++) {
915 /* same operation as vdev_raidz_reconst_pq_func() on xd */
916 *xd
= vdev_raidz_exp2(*rpq
->p
^ *rpq
->pxy
, rpq
->aexp
) ^
917 vdev_raidz_exp2(*rpq
->q
^ *rpq
->qxy
, rpq
->bexp
);
924 vdev_raidz_reconstruct_p(raidz_map_t
*rm
, int *tgts
, int ntgts
)
931 ASSERT(x
>= rm
->rm_firstdatacol
);
932 ASSERT(x
< rm
->rm_cols
);
934 ASSERT(rm
->rm_col
[x
].rc_size
<= rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
);
935 ASSERT(rm
->rm_col
[x
].rc_size
> 0);
937 src
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_abd
;
938 dst
= rm
->rm_col
[x
].rc_abd
;
940 abd_copy(dst
, src
, rm
->rm_col
[x
].rc_size
);
942 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
943 uint64_t size
= MIN(rm
->rm_col
[x
].rc_size
,
944 rm
->rm_col
[c
].rc_size
);
946 src
= rm
->rm_col
[c
].rc_abd
;
947 dst
= rm
->rm_col
[x
].rc_abd
;
952 (void) abd_iterate_func2(dst
, src
, 0, 0, size
,
953 vdev_raidz_reconst_p_func
, NULL
);
956 return (1 << VDEV_RAIDZ_P
);
960 vdev_raidz_reconstruct_q(raidz_map_t
*rm
, int *tgts
, int ntgts
)
968 ASSERT(rm
->rm_col
[x
].rc_size
<= rm
->rm_col
[VDEV_RAIDZ_Q
].rc_size
);
970 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
971 uint64_t size
= (c
== x
) ? 0 : MIN(rm
->rm_col
[x
].rc_size
,
972 rm
->rm_col
[c
].rc_size
);
974 src
= rm
->rm_col
[c
].rc_abd
;
975 dst
= rm
->rm_col
[x
].rc_abd
;
977 if (c
== rm
->rm_firstdatacol
) {
978 abd_copy(dst
, src
, size
);
979 if (rm
->rm_col
[x
].rc_size
> size
)
980 abd_zero_off(dst
, size
,
981 rm
->rm_col
[x
].rc_size
- size
);
983 ASSERT3U(size
, <=, rm
->rm_col
[x
].rc_size
);
984 (void) abd_iterate_func2(dst
, src
, 0, 0, size
,
985 vdev_raidz_reconst_q_pre_func
, NULL
);
986 (void) abd_iterate_func(dst
,
987 size
, rm
->rm_col
[x
].rc_size
- size
,
988 vdev_raidz_reconst_q_pre_tail_func
, NULL
);
992 src
= rm
->rm_col
[VDEV_RAIDZ_Q
].rc_abd
;
993 dst
= rm
->rm_col
[x
].rc_abd
;
994 exp
= 255 - (rm
->rm_cols
- 1 - x
);
996 struct reconst_q_struct rq
= { abd_to_buf(src
), exp
};
997 (void) abd_iterate_func(dst
, 0, rm
->rm_col
[x
].rc_size
,
998 vdev_raidz_reconst_q_post_func
, &rq
);
1000 return (1 << VDEV_RAIDZ_Q
);
1004 vdev_raidz_reconstruct_pq(raidz_map_t
*rm
, int *tgts
, int ntgts
)
1006 uint8_t *p
, *q
, *pxy
, *qxy
, tmp
, a
, b
, aexp
, bexp
;
1007 abd_t
*pdata
, *qdata
;
1008 uint64_t xsize
, ysize
;
1015 ASSERT(x
>= rm
->rm_firstdatacol
);
1016 ASSERT(y
< rm
->rm_cols
);
1018 ASSERT(rm
->rm_col
[x
].rc_size
>= rm
->rm_col
[y
].rc_size
);
1021 * Move the parity data aside -- we're going to compute parity as
1022 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
1023 * reuse the parity generation mechanism without trashing the actual
1024 * parity so we make those columns appear to be full of zeros by
1025 * setting their lengths to zero.
1027 pdata
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_abd
;
1028 qdata
= rm
->rm_col
[VDEV_RAIDZ_Q
].rc_abd
;
1029 xsize
= rm
->rm_col
[x
].rc_size
;
1030 ysize
= rm
->rm_col
[y
].rc_size
;
1032 rm
->rm_col
[VDEV_RAIDZ_P
].rc_abd
=
1033 abd_alloc_linear(rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
, B_TRUE
);
1034 rm
->rm_col
[VDEV_RAIDZ_Q
].rc_abd
=
1035 abd_alloc_linear(rm
->rm_col
[VDEV_RAIDZ_Q
].rc_size
, B_TRUE
);
1036 rm
->rm_col
[x
].rc_size
= 0;
1037 rm
->rm_col
[y
].rc_size
= 0;
1039 vdev_raidz_generate_parity_pq(rm
);
1041 rm
->rm_col
[x
].rc_size
= xsize
;
1042 rm
->rm_col
[y
].rc_size
= ysize
;
1044 p
= abd_to_buf(pdata
);
1045 q
= abd_to_buf(qdata
);
1046 pxy
= abd_to_buf(rm
->rm_col
[VDEV_RAIDZ_P
].rc_abd
);
1047 qxy
= abd_to_buf(rm
->rm_col
[VDEV_RAIDZ_Q
].rc_abd
);
1048 xd
= rm
->rm_col
[x
].rc_abd
;
1049 yd
= rm
->rm_col
[y
].rc_abd
;
1053 * Pxy = P + D_x + D_y
1054 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
1056 * We can then solve for D_x:
1057 * D_x = A * (P + Pxy) + B * (Q + Qxy)
1059 * A = 2^(x - y) * (2^(x - y) + 1)^-1
1060 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
1062 * With D_x in hand, we can easily solve for D_y:
1063 * D_y = P + Pxy + D_x
1066 a
= vdev_raidz_pow2
[255 + x
- y
];
1067 b
= vdev_raidz_pow2
[255 - (rm
->rm_cols
- 1 - x
)];
1068 tmp
= 255 - vdev_raidz_log2
[a
^ 1];
1070 aexp
= vdev_raidz_log2
[vdev_raidz_exp2(a
, tmp
)];
1071 bexp
= vdev_raidz_log2
[vdev_raidz_exp2(b
, tmp
)];
1073 ASSERT3U(xsize
, >=, ysize
);
1074 struct reconst_pq_struct rpq
= { p
, q
, pxy
, qxy
, aexp
, bexp
};
1075 (void) abd_iterate_func2(xd
, yd
, 0, 0, ysize
,
1076 vdev_raidz_reconst_pq_func
, &rpq
);
1077 (void) abd_iterate_func(xd
, ysize
, xsize
- ysize
,
1078 vdev_raidz_reconst_pq_tail_func
, &rpq
);
1080 abd_free(rm
->rm_col
[VDEV_RAIDZ_P
].rc_abd
);
1081 abd_free(rm
->rm_col
[VDEV_RAIDZ_Q
].rc_abd
);
1084 * Restore the saved parity data.
1086 rm
->rm_col
[VDEV_RAIDZ_P
].rc_abd
= pdata
;
1087 rm
->rm_col
[VDEV_RAIDZ_Q
].rc_abd
= qdata
;
1089 return ((1 << VDEV_RAIDZ_P
) | (1 << VDEV_RAIDZ_Q
));
1094 * In the general case of reconstruction, we must solve the system of linear
1095 * equations defined by the coeffecients used to generate parity as well as
1096 * the contents of the data and parity disks. This can be expressed with
1097 * vectors for the original data (D) and the actual data (d) and parity (p)
1098 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1102 * | V | | D_0 | | p_m-1 |
1103 * | | x | : | = | d_0 |
1104 * | I | | D_n-1 | | : |
1105 * | | ~~ ~~ | d_n-1 |
1108 * I is simply a square identity matrix of size n, and V is a vandermonde
1109 * matrix defined by the coeffecients we chose for the various parity columns
1110 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
1111 * computation as well as linear separability.
1114 * | 1 .. 1 1 1 | | p_0 |
1115 * | 2^n-1 .. 4 2 1 | __ __ | : |
1116 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
1117 * | 1 .. 0 0 0 | | D_1 | | d_0 |
1118 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
1119 * | : : : : | | : | | d_2 |
1120 * | 0 .. 1 0 0 | | D_n-1 | | : |
1121 * | 0 .. 0 1 0 | ~~ ~~ | : |
1122 * | 0 .. 0 0 1 | | d_n-1 |
1125 * Note that I, V, d, and p are known. To compute D, we must invert the
1126 * matrix and use the known data and parity values to reconstruct the unknown
1127 * data values. We begin by removing the rows in V|I and d|p that correspond
1128 * to failed or missing columns; we then make V|I square (n x n) and d|p
1129 * sized n by removing rows corresponding to unused parity from the bottom up
1130 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1131 * using Gauss-Jordan elimination. In the example below we use m=3 parity
1132 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1134 * | 1 1 1 1 1 1 1 1 |
1135 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
1136 * | 19 205 116 29 64 16 4 1 | / /
1137 * | 1 0 0 0 0 0 0 0 | / /
1138 * | 0 1 0 0 0 0 0 0 | <--' /
1139 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
1140 * | 0 0 0 1 0 0 0 0 |
1141 * | 0 0 0 0 1 0 0 0 |
1142 * | 0 0 0 0 0 1 0 0 |
1143 * | 0 0 0 0 0 0 1 0 |
1144 * | 0 0 0 0 0 0 0 1 |
1147 * | 1 1 1 1 1 1 1 1 |
1148 * | 19 205 116 29 64 16 4 1 |
1149 * | 1 0 0 0 0 0 0 0 |
1150 * (V|I)' = | 0 0 0 1 0 0 0 0 |
1151 * | 0 0 0 0 1 0 0 0 |
1152 * | 0 0 0 0 0 1 0 0 |
1153 * | 0 0 0 0 0 0 1 0 |
1154 * | 0 0 0 0 0 0 0 1 |
1157 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1158 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1159 * matrix is not singular.
1161 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1162 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1163 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1164 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1165 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1166 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1167 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1168 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1171 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1172 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1173 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1174 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1175 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1176 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1177 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1178 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1181 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1182 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1183 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1184 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1185 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1186 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1187 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1188 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1191 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1192 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1193 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1194 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1195 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1196 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1197 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1198 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1201 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1202 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1203 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1204 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1205 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1206 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1207 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1208 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1211 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1212 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1213 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1214 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1215 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1216 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1217 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1218 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1221 * | 0 0 1 0 0 0 0 0 |
1222 * | 167 100 5 41 159 169 217 208 |
1223 * | 166 100 4 40 158 168 216 209 |
1224 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1225 * | 0 0 0 0 1 0 0 0 |
1226 * | 0 0 0 0 0 1 0 0 |
1227 * | 0 0 0 0 0 0 1 0 |
1228 * | 0 0 0 0 0 0 0 1 |
1231 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1232 * of the missing data.
1234 * As is apparent from the example above, the only non-trivial rows in the
1235 * inverse matrix correspond to the data disks that we're trying to
1236 * reconstruct. Indeed, those are the only rows we need as the others would
1237 * only be useful for reconstructing data known or assumed to be valid. For
1238 * that reason, we only build the coefficients in the rows that correspond to
1244 vdev_raidz_matrix_init(raidz_map_t
*rm
, int n
, int nmap
, int *map
,
1250 ASSERT(n
== rm
->rm_cols
- rm
->rm_firstdatacol
);
1253 * Fill in the missing rows of interest.
1255 for (i
= 0; i
< nmap
; i
++) {
1256 ASSERT3S(0, <=, map
[i
]);
1257 ASSERT3S(map
[i
], <=, 2);
1264 for (j
= 0; j
< n
; j
++) {
1268 rows
[i
][j
] = vdev_raidz_pow2
[pow
];
1274 vdev_raidz_matrix_invert(raidz_map_t
*rm
, int n
, int nmissing
, int *missing
,
1275 uint8_t **rows
, uint8_t **invrows
, const uint8_t *used
)
1281 * Assert that the first nmissing entries from the array of used
1282 * columns correspond to parity columns and that subsequent entries
1283 * correspond to data columns.
1285 for (i
= 0; i
< nmissing
; i
++) {
1286 ASSERT3S(used
[i
], <, rm
->rm_firstdatacol
);
1288 for (; i
< n
; i
++) {
1289 ASSERT3S(used
[i
], >=, rm
->rm_firstdatacol
);
1293 * First initialize the storage where we'll compute the inverse rows.
1295 for (i
= 0; i
< nmissing
; i
++) {
1296 for (j
= 0; j
< n
; j
++) {
1297 invrows
[i
][j
] = (i
== j
) ? 1 : 0;
1302 * Subtract all trivial rows from the rows of consequence.
1304 for (i
= 0; i
< nmissing
; i
++) {
1305 for (j
= nmissing
; j
< n
; j
++) {
1306 ASSERT3U(used
[j
], >=, rm
->rm_firstdatacol
);
1307 jj
= used
[j
] - rm
->rm_firstdatacol
;
1309 invrows
[i
][j
] = rows
[i
][jj
];
1315 * For each of the rows of interest, we must normalize it and subtract
1316 * a multiple of it from the other rows.
1318 for (i
= 0; i
< nmissing
; i
++) {
1319 for (j
= 0; j
< missing
[i
]; j
++) {
1320 ASSERT0(rows
[i
][j
]);
1322 ASSERT3U(rows
[i
][missing
[i
]], !=, 0);
1325 * Compute the inverse of the first element and multiply each
1326 * element in the row by that value.
1328 log
= 255 - vdev_raidz_log2
[rows
[i
][missing
[i
]]];
1330 for (j
= 0; j
< n
; j
++) {
1331 rows
[i
][j
] = vdev_raidz_exp2(rows
[i
][j
], log
);
1332 invrows
[i
][j
] = vdev_raidz_exp2(invrows
[i
][j
], log
);
1335 for (ii
= 0; ii
< nmissing
; ii
++) {
1339 ASSERT3U(rows
[ii
][missing
[i
]], !=, 0);
1341 log
= vdev_raidz_log2
[rows
[ii
][missing
[i
]]];
1343 for (j
= 0; j
< n
; j
++) {
1345 vdev_raidz_exp2(rows
[i
][j
], log
);
1347 vdev_raidz_exp2(invrows
[i
][j
], log
);
1353 * Verify that the data that is left in the rows are properly part of
1354 * an identity matrix.
1356 for (i
= 0; i
< nmissing
; i
++) {
1357 for (j
= 0; j
< n
; j
++) {
1358 if (j
== missing
[i
]) {
1359 ASSERT3U(rows
[i
][j
], ==, 1);
1361 ASSERT0(rows
[i
][j
]);
1368 vdev_raidz_matrix_reconstruct(raidz_map_t
*rm
, int n
, int nmissing
,
1369 int *missing
, uint8_t **invrows
, const uint8_t *used
)
1374 uint8_t *dst
[VDEV_RAIDZ_MAXPARITY
];
1375 uint64_t dcount
[VDEV_RAIDZ_MAXPARITY
];
1379 uint8_t *invlog
[VDEV_RAIDZ_MAXPARITY
];
1383 psize
= sizeof (invlog
[0][0]) * n
* nmissing
;
1384 p
= kmem_alloc(psize
, KM_SLEEP
);
1386 for (pp
= p
, i
= 0; i
< nmissing
; i
++) {
1391 for (i
= 0; i
< nmissing
; i
++) {
1392 for (j
= 0; j
< n
; j
++) {
1393 ASSERT3U(invrows
[i
][j
], !=, 0);
1394 invlog
[i
][j
] = vdev_raidz_log2
[invrows
[i
][j
]];
1398 for (i
= 0; i
< n
; i
++) {
1400 ASSERT3U(c
, <, rm
->rm_cols
);
1402 src
= abd_to_buf(rm
->rm_col
[c
].rc_abd
);
1403 ccount
= rm
->rm_col
[c
].rc_size
;
1404 for (j
= 0; j
< nmissing
; j
++) {
1405 cc
= missing
[j
] + rm
->rm_firstdatacol
;
1406 ASSERT3U(cc
, >=, rm
->rm_firstdatacol
);
1407 ASSERT3U(cc
, <, rm
->rm_cols
);
1408 ASSERT3U(cc
, !=, c
);
1410 dst
[j
] = abd_to_buf(rm
->rm_col
[cc
].rc_abd
);
1411 dcount
[j
] = rm
->rm_col
[cc
].rc_size
;
1414 ASSERT(ccount
>= rm
->rm_col
[missing
[0]].rc_size
|| i
> 0);
1416 for (x
= 0; x
< ccount
; x
++, src
++) {
1418 log
= vdev_raidz_log2
[*src
];
1420 for (cc
= 0; cc
< nmissing
; cc
++) {
1421 if (x
>= dcount
[cc
])
1427 if ((ll
= log
+ invlog
[cc
][i
]) >= 255)
1429 val
= vdev_raidz_pow2
[ll
];
1440 kmem_free(p
, psize
);
1444 vdev_raidz_reconstruct_general(raidz_map_t
*rm
, int *tgts
, int ntgts
)
1448 int missing_rows
[VDEV_RAIDZ_MAXPARITY
];
1449 int parity_map
[VDEV_RAIDZ_MAXPARITY
];
1454 uint8_t *rows
[VDEV_RAIDZ_MAXPARITY
];
1455 uint8_t *invrows
[VDEV_RAIDZ_MAXPARITY
];
1458 abd_t
**bufs
= NULL
;
1463 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1464 * temporary linear ABDs.
1466 if (!abd_is_linear(rm
->rm_col
[rm
->rm_firstdatacol
].rc_abd
)) {
1467 bufs
= kmem_alloc(rm
->rm_cols
* sizeof (abd_t
*), KM_PUSHPAGE
);
1469 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
1470 raidz_col_t
*col
= &rm
->rm_col
[c
];
1472 bufs
[c
] = col
->rc_abd
;
1473 col
->rc_abd
= abd_alloc_linear(col
->rc_size
, B_TRUE
);
1474 abd_copy(col
->rc_abd
, bufs
[c
], col
->rc_size
);
1478 n
= rm
->rm_cols
- rm
->rm_firstdatacol
;
1481 * Figure out which data columns are missing.
1484 for (t
= 0; t
< ntgts
; t
++) {
1485 if (tgts
[t
] >= rm
->rm_firstdatacol
) {
1486 missing_rows
[nmissing_rows
++] =
1487 tgts
[t
] - rm
->rm_firstdatacol
;
1492 * Figure out which parity columns to use to help generate the missing
1495 for (tt
= 0, c
= 0, i
= 0; i
< nmissing_rows
; c
++) {
1497 ASSERT(c
< rm
->rm_firstdatacol
);
1500 * Skip any targeted parity columns.
1502 if (c
== tgts
[tt
]) {
1514 ASSERT3U(code
, <, 1 << VDEV_RAIDZ_MAXPARITY
);
1516 psize
= (sizeof (rows
[0][0]) + sizeof (invrows
[0][0])) *
1517 nmissing_rows
* n
+ sizeof (used
[0]) * n
;
1518 p
= kmem_alloc(psize
, KM_SLEEP
);
1520 for (pp
= p
, i
= 0; i
< nmissing_rows
; i
++) {
1528 for (i
= 0; i
< nmissing_rows
; i
++) {
1529 used
[i
] = parity_map
[i
];
1532 for (tt
= 0, c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
1533 if (tt
< nmissing_rows
&&
1534 c
== missing_rows
[tt
] + rm
->rm_firstdatacol
) {
1545 * Initialize the interesting rows of the matrix.
1547 vdev_raidz_matrix_init(rm
, n
, nmissing_rows
, parity_map
, rows
);
1550 * Invert the matrix.
1552 vdev_raidz_matrix_invert(rm
, n
, nmissing_rows
, missing_rows
, rows
,
1556 * Reconstruct the missing data using the generated matrix.
1558 vdev_raidz_matrix_reconstruct(rm
, n
, nmissing_rows
, missing_rows
,
1561 kmem_free(p
, psize
);
1564 * copy back from temporary linear abds and free them
1567 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
1568 raidz_col_t
*col
= &rm
->rm_col
[c
];
1570 abd_copy(bufs
[c
], col
->rc_abd
, col
->rc_size
);
1571 abd_free(col
->rc_abd
);
1572 col
->rc_abd
= bufs
[c
];
1574 kmem_free(bufs
, rm
->rm_cols
* sizeof (abd_t
*));
1581 vdev_raidz_reconstruct(raidz_map_t
*rm
, int *t
, int nt
)
1583 int tgts
[VDEV_RAIDZ_MAXPARITY
], *dt
;
1587 int nbadparity
, nbaddata
;
1588 int parity_valid
[VDEV_RAIDZ_MAXPARITY
];
1591 * The tgts list must already be sorted.
1593 for (i
= 1; i
< nt
; i
++) {
1594 ASSERT(t
[i
] > t
[i
- 1]);
1597 nbadparity
= rm
->rm_firstdatacol
;
1598 nbaddata
= rm
->rm_cols
- nbadparity
;
1600 for (i
= 0, c
= 0; c
< rm
->rm_cols
; c
++) {
1601 if (c
< rm
->rm_firstdatacol
)
1602 parity_valid
[c
] = B_FALSE
;
1604 if (i
< nt
&& c
== t
[i
]) {
1607 } else if (rm
->rm_col
[c
].rc_error
!= 0) {
1609 } else if (c
>= rm
->rm_firstdatacol
) {
1612 parity_valid
[c
] = B_TRUE
;
1617 ASSERT(ntgts
>= nt
);
1618 ASSERT(nbaddata
>= 0);
1619 ASSERT(nbaddata
+ nbadparity
== ntgts
);
1621 dt
= &tgts
[nbadparity
];
1624 * See if we can use any of our optimized reconstruction routines.
1626 if (!vdev_raidz_default_to_general
) {
1629 if (parity_valid
[VDEV_RAIDZ_P
])
1630 return (vdev_raidz_reconstruct_p(rm
, dt
, 1));
1632 ASSERT(rm
->rm_firstdatacol
> 1);
1634 if (parity_valid
[VDEV_RAIDZ_Q
])
1635 return (vdev_raidz_reconstruct_q(rm
, dt
, 1));
1637 ASSERT(rm
->rm_firstdatacol
> 2);
1641 ASSERT(rm
->rm_firstdatacol
> 1);
1643 if (parity_valid
[VDEV_RAIDZ_P
] &&
1644 parity_valid
[VDEV_RAIDZ_Q
])
1645 return (vdev_raidz_reconstruct_pq(rm
, dt
, 2));
1647 ASSERT(rm
->rm_firstdatacol
> 2);
1653 code
= vdev_raidz_reconstruct_general(rm
, tgts
, ntgts
);
1654 ASSERT(code
< (1 << VDEV_RAIDZ_MAXPARITY
));
1660 vdev_raidz_open(vdev_t
*vd
, uint64_t *asize
, uint64_t *max_asize
,
1664 uint64_t nparity
= vd
->vdev_nparity
;
1669 ASSERT(nparity
> 0);
1671 if (nparity
> VDEV_RAIDZ_MAXPARITY
||
1672 vd
->vdev_children
< nparity
+ 1) {
1673 vd
->vdev_stat
.vs_aux
= VDEV_AUX_BAD_LABEL
;
1674 return (SET_ERROR(EINVAL
));
1677 vdev_open_children(vd
);
1679 for (c
= 0; c
< vd
->vdev_children
; c
++) {
1680 cvd
= vd
->vdev_child
[c
];
1682 if (cvd
->vdev_open_error
!= 0) {
1683 lasterror
= cvd
->vdev_open_error
;
1688 *asize
= MIN(*asize
- 1, cvd
->vdev_asize
- 1) + 1;
1689 *max_asize
= MIN(*max_asize
- 1, cvd
->vdev_max_asize
- 1) + 1;
1690 *ashift
= MAX(*ashift
, cvd
->vdev_ashift
);
1693 *asize
*= vd
->vdev_children
;
1694 *max_asize
*= vd
->vdev_children
;
1696 if (numerrors
> nparity
) {
1697 vd
->vdev_stat
.vs_aux
= VDEV_AUX_NO_REPLICAS
;
1705 vdev_raidz_close(vdev_t
*vd
)
1709 for (c
= 0; c
< vd
->vdev_children
; c
++)
1710 vdev_close(vd
->vdev_child
[c
]);
1714 * Handle a read or write I/O to a RAID-Z dump device.
1716 * The dump device is in a unique situation compared to other ZFS datasets:
1717 * writing to this device should be as simple and fast as possible. In
1718 * addition, durability matters much less since the dump will be extracted
1719 * once the machine reboots. For that reason, this function eschews parity for
1720 * performance and simplicity. The dump device uses the checksum setting
1721 * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1724 * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
1725 * 128 KB will not fill an entire block; in addition, they may not be properly
1726 * aligned. In that case, this function uses the preallocated 128 KB block and
1727 * omits reading or writing any "empty" portions of that block, as opposed to
1728 * allocating a fresh appropriately-sized block.
1730 * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1732 * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1734 * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1735 * allocated which spans all five child vdevs. 8 KB of data would be written to
1736 * each of four vdevs, with the fifth containing the parity bits.
1738 * parity data data data data
1739 * | PP | XX | XX | XX | XX |
1742 * 8 KB parity ------8 KB data blocks------
1744 * However, when writing to the dump device, the behavior is different:
1746 * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1748 * Unlike the normal RAID-Z case in which the block is allocated based on the
1749 * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
1750 * I/O size is less than 128 KB, only the actual portions of data are written.
1751 * In this example the data is written to the third data vdev since that vdev
1752 * contains the offset [64 KB, 96 KB).
1754 * parity data data data data
1760 * As a result, an individual I/O may not span all child vdevs; moreover, a
1761 * small I/O may only operate on a single child vdev.
1763 * Note that since there are no parity bits calculated or written, this format
1764 * remains the same no matter how many parity bits are used in a normal RAID-Z
1765 * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
1768 * parity parity parity data data data data
1769 * | | | | | | XX | |
1775 vdev_raidz_physio(vdev_t
*vd
, caddr_t data
, size_t size
,
1776 uint64_t offset
, uint64_t origoffset
, boolean_t doread
, boolean_t isdump
)
1778 vdev_t
*tvd
= vd
->vdev_top
;
1784 uint64_t start
, end
, colstart
, colend
;
1785 uint64_t coloffset
, colsize
, colskip
;
1787 int flags
= doread
? B_READ
: B_WRITE
;
1792 * Don't write past the end of the block
1794 VERIFY3U(offset
+ size
, <=, origoffset
+ SPA_OLD_MAXBLOCKSIZE
);
1800 * Allocate a RAID-Z map for this block. Note that this block starts
1801 * from the "original" offset, this is, the offset of the extent which
1802 * contains the requisite offset of the data being read or written.
1804 * Even if this I/O operation doesn't span the full block size, let's
1805 * treat the on-disk format as if the only blocks are the complete 128
1808 abd_t
*abd
= abd_get_from_buf(data
- (offset
- origoffset
),
1809 SPA_OLD_MAXBLOCKSIZE
);
1810 rm
= vdev_raidz_map_alloc(abd
,
1811 SPA_OLD_MAXBLOCKSIZE
, origoffset
, tvd
->vdev_ashift
,
1812 vd
->vdev_children
, vd
->vdev_nparity
);
1814 coloffset
= origoffset
;
1816 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
;
1817 c
++, coloffset
+= rc
->rc_size
) {
1818 rc
= &rm
->rm_col
[c
];
1819 cvd
= vd
->vdev_child
[rc
->rc_devidx
];
1822 * Find the start and end of this column in the RAID-Z map,
1823 * keeping in mind that the stated size and offset of the
1824 * operation may not fill the entire column for this vdev.
1826 * If any portion of the data spans this column, issue the
1827 * appropriate operation to the vdev.
1829 if (coloffset
+ rc
->rc_size
<= start
)
1831 if (coloffset
>= end
)
1834 colstart
= MAX(coloffset
, start
);
1835 colend
= MIN(end
, coloffset
+ rc
->rc_size
);
1836 colsize
= colend
- colstart
;
1837 colskip
= colstart
- coloffset
;
1839 VERIFY3U(colsize
, <=, rc
->rc_size
);
1840 VERIFY3U(colskip
, <=, rc
->rc_size
);
1843 * Note that the child vdev will have a vdev label at the start
1844 * of its range of offsets, hence the need for
1845 * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
1846 * example of why this calculation is needed.
1848 if ((err
= vdev_disk_physio(cvd
,
1849 ((char *)abd_to_buf(rc
->rc_abd
)) + colskip
, colsize
,
1850 VDEV_LABEL_OFFSET(rc
->rc_offset
) + colskip
,
1851 flags
, isdump
)) != 0)
1855 vdev_raidz_map_free(rm
);
1863 vdev_raidz_asize(vdev_t
*vd
, uint64_t psize
)
1866 uint64_t ashift
= vd
->vdev_top
->vdev_ashift
;
1867 uint64_t cols
= vd
->vdev_children
;
1868 uint64_t nparity
= vd
->vdev_nparity
;
1870 asize
= ((psize
- 1) >> ashift
) + 1;
1871 asize
+= nparity
* ((asize
+ cols
- nparity
- 1) / (cols
- nparity
));
1872 asize
= roundup(asize
, nparity
+ 1) << ashift
;
1878 vdev_raidz_child_done(zio_t
*zio
)
1880 raidz_col_t
*rc
= zio
->io_private
;
1882 rc
->rc_error
= zio
->io_error
;
1888 * Start an IO operation on a RAIDZ VDev
1891 * - For write operations:
1892 * 1. Generate the parity data
1893 * 2. Create child zio write operations to each column's vdev, for both
1895 * 3. If the column skips any sectors for padding, create optional dummy
1896 * write zio children for those areas to improve aggregation continuity.
1897 * - For read operations:
1898 * 1. Create child zio read operations to each data column's vdev to read
1899 * the range of data required for zio.
1900 * 2. If this is a scrub or resilver operation, or if any of the data
1901 * vdevs have had errors, then create zio read operations to the parity
1902 * columns' VDevs as well.
1905 vdev_raidz_io_start(zio_t
*zio
)
1907 vdev_t
*vd
= zio
->io_vd
;
1908 vdev_t
*tvd
= vd
->vdev_top
;
1914 rm
= vdev_raidz_map_alloc(zio
->io_abd
, zio
->io_size
, zio
->io_offset
,
1915 tvd
->vdev_ashift
, vd
->vdev_children
,
1919 zio
->io_vsd_ops
= &vdev_raidz_vsd_ops
;
1921 ASSERT3U(rm
->rm_asize
, ==, vdev_psize_to_asize(vd
, zio
->io_size
));
1923 if (zio
->io_type
== ZIO_TYPE_WRITE
) {
1924 vdev_raidz_generate_parity(rm
);
1926 for (c
= 0; c
< rm
->rm_cols
; c
++) {
1927 rc
= &rm
->rm_col
[c
];
1928 cvd
= vd
->vdev_child
[rc
->rc_devidx
];
1929 zio_nowait(zio_vdev_child_io(zio
, NULL
, cvd
,
1930 rc
->rc_offset
, rc
->rc_abd
, rc
->rc_size
,
1931 zio
->io_type
, zio
->io_priority
, 0,
1932 vdev_raidz_child_done
, rc
));
1936 * Generate optional I/Os for any skipped sectors to improve
1937 * aggregation contiguity.
1939 for (c
= rm
->rm_skipstart
, i
= 0; i
< rm
->rm_nskip
; c
++, i
++) {
1940 ASSERT(c
<= rm
->rm_scols
);
1941 if (c
== rm
->rm_scols
)
1943 rc
= &rm
->rm_col
[c
];
1944 cvd
= vd
->vdev_child
[rc
->rc_devidx
];
1945 zio_nowait(zio_vdev_child_io(zio
, NULL
, cvd
,
1946 rc
->rc_offset
+ rc
->rc_size
, NULL
,
1947 1 << tvd
->vdev_ashift
,
1948 zio
->io_type
, zio
->io_priority
,
1949 ZIO_FLAG_NODATA
| ZIO_FLAG_OPTIONAL
, NULL
, NULL
));
1956 ASSERT(zio
->io_type
== ZIO_TYPE_READ
);
1959 * Iterate over the columns in reverse order so that we hit the parity
1960 * last -- any errors along the way will force us to read the parity.
1962 for (c
= rm
->rm_cols
- 1; c
>= 0; c
--) {
1963 rc
= &rm
->rm_col
[c
];
1964 cvd
= vd
->vdev_child
[rc
->rc_devidx
];
1965 if (!vdev_readable(cvd
)) {
1966 if (c
>= rm
->rm_firstdatacol
)
1967 rm
->rm_missingdata
++;
1969 rm
->rm_missingparity
++;
1970 rc
->rc_error
= SET_ERROR(ENXIO
);
1971 rc
->rc_tried
= 1; /* don't even try */
1975 if (vdev_dtl_contains(cvd
, DTL_MISSING
, zio
->io_txg
, 1)) {
1976 if (c
>= rm
->rm_firstdatacol
)
1977 rm
->rm_missingdata
++;
1979 rm
->rm_missingparity
++;
1980 rc
->rc_error
= SET_ERROR(ESTALE
);
1984 if (c
>= rm
->rm_firstdatacol
|| rm
->rm_missingdata
> 0 ||
1985 (zio
->io_flags
& (ZIO_FLAG_SCRUB
| ZIO_FLAG_RESILVER
))) {
1986 zio_nowait(zio_vdev_child_io(zio
, NULL
, cvd
,
1987 rc
->rc_offset
, rc
->rc_abd
, rc
->rc_size
,
1988 zio
->io_type
, zio
->io_priority
, 0,
1989 vdev_raidz_child_done
, rc
));
1998 * Report a checksum error for a child of a RAID-Z device.
2001 raidz_checksum_error(zio_t
*zio
, raidz_col_t
*rc
, void *bad_data
)
2004 vdev_t
*vd
= zio
->io_vd
->vdev_child
[rc
->rc_devidx
];
2006 if (!(zio
->io_flags
& ZIO_FLAG_SPECULATIVE
)) {
2007 zio_bad_cksum_t zbc
;
2008 raidz_map_t
*rm
= zio
->io_vsd
;
2010 mutex_enter(&vd
->vdev_stat_lock
);
2011 vd
->vdev_stat
.vs_checksum_errors
++;
2012 mutex_exit(&vd
->vdev_stat_lock
);
2014 zbc
.zbc_has_cksum
= 0;
2015 zbc
.zbc_injected
= rm
->rm_ecksuminjected
;
2017 buf
= abd_borrow_buf_copy(rc
->rc_abd
, rc
->rc_size
);
2018 zfs_ereport_post_checksum(zio
->io_spa
, vd
, zio
,
2019 rc
->rc_offset
, rc
->rc_size
, buf
, bad_data
,
2021 abd_return_buf(rc
->rc_abd
, buf
, rc
->rc_size
);
2026 * We keep track of whether or not there were any injected errors, so that
2027 * any ereports we generate can note it.
2030 raidz_checksum_verify(zio_t
*zio
)
2032 zio_bad_cksum_t zbc
;
2033 raidz_map_t
*rm
= zio
->io_vsd
;
2035 int ret
= zio_checksum_error(zio
, &zbc
);
2036 if (ret
!= 0 && zbc
.zbc_injected
!= 0)
2037 rm
->rm_ecksuminjected
= 1;
2043 * Generate the parity from the data columns. If we tried and were able to
2044 * read the parity without error, verify that the generated parity matches the
2045 * data we read. If it doesn't, we fire off a checksum error. Return the
2046 * number such failures.
2049 raidz_parity_verify(zio_t
*zio
, raidz_map_t
*rm
)
2051 void *orig
[VDEV_RAIDZ_MAXPARITY
];
2055 blkptr_t
*bp
= zio
->io_bp
;
2056 enum zio_checksum checksum
= (bp
== NULL
? zio
->io_prop
.zp_checksum
:
2057 (BP_IS_GANG(bp
) ? ZIO_CHECKSUM_GANG_HEADER
: BP_GET_CHECKSUM(bp
)));
2059 if (checksum
== ZIO_CHECKSUM_NOPARITY
)
2062 for (c
= 0; c
< rm
->rm_firstdatacol
; c
++) {
2063 rc
= &rm
->rm_col
[c
];
2064 if (!rc
->rc_tried
|| rc
->rc_error
!= 0)
2066 orig
[c
] = zio_buf_alloc(rc
->rc_size
);
2067 abd_copy_to_buf(orig
[c
], rc
->rc_abd
, rc
->rc_size
);
2070 vdev_raidz_generate_parity(rm
);
2072 for (c
= 0; c
< rm
->rm_firstdatacol
; c
++) {
2073 rc
= &rm
->rm_col
[c
];
2074 if (!rc
->rc_tried
|| rc
->rc_error
!= 0)
2076 if (abd_cmp_buf(rc
->rc_abd
, orig
[c
], rc
->rc_size
) != 0) {
2077 raidz_checksum_error(zio
, rc
, orig
[c
]);
2078 rc
->rc_error
= SET_ERROR(ECKSUM
);
2081 zio_buf_free(orig
[c
], rc
->rc_size
);
2088 * Keep statistics on all the ways that we used parity to correct data.
2090 static uint64_t raidz_corrected
[1 << VDEV_RAIDZ_MAXPARITY
];
2093 vdev_raidz_worst_error(raidz_map_t
*rm
)
2097 for (int c
= 0; c
< rm
->rm_cols
; c
++)
2098 error
= zio_worst_error(error
, rm
->rm_col
[c
].rc_error
);
2104 * Iterate over all combinations of bad data and attempt a reconstruction.
2105 * Note that the algorithm below is non-optimal because it doesn't take into
2106 * account how reconstruction is actually performed. For example, with
2107 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
2108 * is targeted as invalid as if columns 1 and 4 are targeted since in both
2109 * cases we'd only use parity information in column 0.
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				abd_copy_to_buf(orig[i], rc->rc_abd,
				    rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				abd_copy_from_buf(rc->rc_abd, orig[i],
				    rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
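
/*
 * Added note: on success vdev_raidz_combrec() returns the parity bitmask
 * of the winning reconstruction; since every reconstruction uses at least
 * one parity column, that bitmask is always nonzero, so a return of 0
 * unambiguously means every combination failed.
 */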
/*
 * Complete an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Check for errors on the child IOs.
 *   2. Return, setting an error code if too few child VDevs were written
 *      to reconstruct the data later. Note that partial writes are
 *      considered successful if they can be reconstructed at all.
 * - For read operations:
 *   1. Check for errors on the child IOs.
 *   2. If data errors occurred:
 *      a. Try to reassemble the data from the parity available.
 *      b. If we haven't yet read the parity drives, read them now.
 *      c. If all parity drives have been read but the data still doesn't
 *         reassemble with a correct checksum, then try combinatorial
 *         reconstruction.
 *      d. If that doesn't work, return an error.
 *   3. If there were unexpected errors or this is a resilver operation,
 *      rewrite the vdevs that had errors.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);	/* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}
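
	/*
	 * Added note: rc_skipped marks columns we never actually issued a
	 * read for (e.g. a known-missing device), so only failures on
	 * columns we expected to succeed count as "unexpected" and feed the
	 * repair logic at the end of this function.
	 */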
	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed. Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
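	/*
	 * Added worked example: on raidz2, rm_firstdatacol is 2. If one
	 * parity column went unread (parity_untried == 1), this first phase
	 * can only absorb a single error (2 - 1); two errors would force the
	 * re-read and combinatorial phases below.
	 */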
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}
	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}
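
	/*
	 * Added note: zio_vdev_io_redone() rewinds the zio's vdev stage so
	 * that once the reissued child reads complete, vdev_raidz_io_done()
	 * runs again -- this time with every column marked rc_tried, which
	 * steers us into the combinatorial phase below.
	 */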
	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm->rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, provided the IO wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}
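
/*
 * Added note: repair writes carry ZIO_FLAG_SELF_HEAL only when the damage
 * was unexpected, distinguishing opportunistic self-healing from rewrites
 * that are simply part of a resilver.
 */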
static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted > vd->vdev_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}
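
/*
 * Added note: a raidz vdev tolerates at most vdev_nparity faulted
 * children; with even one more, too little redundancy remains to
 * reconstruct data, so the vdev is marked unopenable.
 */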
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,
	NULL,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};