2 * Copyright (c) 1997, 1998, 1999
3 * Nan Yang Computer Services Limited. All rights reserved.
5 * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
7 * Written by Greg Lehey
9 * This software is distributed under the so-called ``Berkeley
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. All advertising materials mentioning features or use of this software
21 * must display the following acknowledgement:
22 * This product includes software developed by Nan Yang Computer
24 * 4. Neither the name of the Company nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
28 * This software is provided ``as is'', and any express or implied
29 * warranties, including, but not limited to, the implied warranties of
30 * merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall the company or contributors be liable for any
32 * direct, indirect, incidental, special, exemplary, or consequential
33 * damages (including, but not limited to, procurement of substitute
34 * goods or services; loss of use, data, or profits; or business
35 * interruption) however caused and on any theory of liability, whether
36 * in contract, strict liability, or tort (including negligence or
37 * otherwise) arising in any way out of the use of this software, even if
38 * advised of the possibility of such damage.
40 * $Id: vinumrevive.c,v 1.14 2000/12/21 01:55:11 grog Exp grog $
41 * $FreeBSD: src/sys/dev/vinum/vinumrevive.c,v 1.22.2.5 2001/03/13 02:59:43 grog Exp $
48 * Revive a block of a subdisk. Return an error
49 * indication. EAGAIN means successful copy, but
50 * that more blocks remain to be copied. EINVAL
51 * means that the subdisk isn't associated with a
52 * plex (which means a programming error if we get
53 * here at all; FIXME).
/*
 * revive_block: bring one more block of subdisk sdno up to date.
 *
 * Visible behaviour of one call:
 *  - sd->revive_blocksize is normalized (default when zero or not a
 *    multiple of 1 << DEV_BSHIFT bytes, clamped to MAX_REVIVE_BLOCKSIZE).
 *  - The transfer size is one revive block, or the remainder between
 *    sd->revived and sd->sectors when that is smaller.
 *  - The plex block number to read is derived from sd->revived and
 *    sd->plexoffset according to plex->organization.
 *  - When the subdisk is the parity subdisk of the stripe, the block is
 *    produced by parityrebuild(); otherwise a buffer is allocated and the
 *    data is read with vinumstart() and biowait().
 *  - The buffer is then written to the subdisk at sd->revived, and
 *    sd->revived is advanced by the number of sectors written.
 *  - Once sd->revived reaches sd->sectors the subdisk state is forced to
 *    sd_up, the change is logged and the configuration is saved.
 *  - Any range lock taken is released, every request queued on
 *    sd->waitlist is launched, and the buffer header is returned.
 *
 * Returns ENOMEM when parityrebuild() yields no buffer.  The comment in
 * front of this function documents EAGAIN (more blocks remain) and EINVAL
 * (subdisk has no plex) as further results.
 *
 * NOTE(review): this listing has lost lines (return type, braces, some
 * declarations, the case labels of the switch and most return statements).
 * The comments below describe only what the visible lines establish.
 */
57 revive_block(int sdno
)
65 int size
; /* size of revive block, bytes */
66 vinum_off_t plexblkno
; /* lblkno in plex */
67 int psd
; /* parity subdisk number */
68 u_int64_t stripe
; /* stripe number */
69 int paritysd
= 0; /* set if this is the parity stripe */
70 struct rangelock
*lock
; /* for locking */
71 vinum_off_t stripeoffset
; /* offset in stripe */
73 plexblkno
= 0; /* to keep the compiler happy */
/*
 * A subdisk without a plex cannot be revived.  The statement executed
 * for this case is not visible in this listing.
 */
76 if (sd
->plexno
< 0) /* no plex? */
78 plex
= &PLEX
[sd
->plexno
]; /* point to plex */
80 vol
= &VOL
[plex
->volno
];
/*
 * Normalize the revive block size: use the default when it is zero or
 * has any of the low DEV_BSHIFT bits set, and cap it at the maximum.
 */
84 if ((sd
->revive_blocksize
== 0) /* no block size */
85 ||(sd
->revive_blocksize
& ((1 << DEV_BSHIFT
) - 1))) /* or invalid block size */
86 sd
->revive_blocksize
= DEFAULT_REVIVE_BLOCKSIZE
;
87 else if (sd
->revive_blocksize
> MAX_REVIVE_BLOCKSIZE
)
88 sd
->revive_blocksize
= MAX_REVIVE_BLOCKSIZE
;
/*
 * Bytes to move in this call: the smaller of one revive block and the
 * sectors still to be revived, computed in sectors and shifted to bytes.
 */
89 size
= u64min(sd
->revive_blocksize
>> DEV_BSHIFT
, sd
->sectors
- sd
->revived
) << DEV_BSHIFT
;
90 sd
->reviver
= curproc
->p_pid
; /* note who last had a bash at it */
92 /* Now decide where to read from */
93 switch (plex
->organization
) {
/*
 * First mapping: plex block = sectors already revived plus the offset of
 * the subdisk in the plex.  (The case label is not visible here.)
 */
95 plexblkno
= sd
->revived
+ sd
->plexoffset
; /* corresponding address in plex */
/*
 * Second mapping: the transfer is trimmed so that it ends at the stripe
 * boundary, and the start of the stripe is scaled by plex->subdisks.
 */
99 stripeoffset
= sd
->revived
% plex
->stripesize
; /* offset from beginning of stripe */
100 if (stripeoffset
+ (size
>> DEV_BSHIFT
) > plex
->stripesize
)
101 size
= (plex
->stripesize
- stripeoffset
) << DEV_BSHIFT
;
102 plexblkno
= sd
->plexoffset
/* base */
103 + (sd
->revived
- stripeoffset
) * plex
->subdisks
/* offset to beginning of stripe */
104 + stripeoffset
; /* offset from beginning of stripe */
/*
 * Third mapping: the start of the stripe is scaled by (subdisks - 1),
 * presumably because one subdisk per stripe holds parity rather than data.
 */
109 stripeoffset
= sd
->revived
% plex
->stripesize
; /* offset from beginning of stripe */
110 plexblkno
= sd
->plexoffset
/* base */
111 + (sd
->revived
- stripeoffset
) * (plex
->subdisks
- 1) /* offset to beginning of stripe */
112 +stripeoffset
; /* offset from beginning of stripe */
113 stripe
= (sd
->revived
/ plex
->stripesize
); /* stripe number */
115 /* Make sure we don't go beyond the end of the band. */
116 size
= u64min(size
, (plex
->stripesize
- stripeoffset
) << DEV_BSHIFT
);
/*
 * For plex_raid4 the parity subdisk is always the last one.  The second
 * assignment derives it from the stripe number instead; the keyword that
 * selects between the two is not visible in this listing.
 */
117 if (plex
->organization
== plex_raid4
)
118 psd
= plex
->subdisks
- 1; /* parity subdisk for this stripe */
120 psd
= plex
->subdisks
- 1 - stripe
% plex
->subdisks
; /* parity subdisk for this stripe */
121 paritysd
= plex
->sdnos
[psd
] == sdno
; /* note if it's the parity subdisk */
124 * Now adjust for the strangenesses
125 * in RAID-4 and RAID-5 striping.
127 if (sd
->plexsdno
> psd
) /* beyond the parity stripe, */
128 plexblkno
-= plex
->stripesize
; /* one stripe less */
130 plexblkno
-= plex
->stripesize
* sd
->plexsdno
; /* go back to the beginning of the band */
133 case plex_disorg
: /* to keep the compiler happy */
/*
 * Parity blocks are regenerated by parityrebuild(), which also hands back
 * the lock through &lock.  Data blocks get a buffer header from the
 * physbufs pool and a data area of size bytes.
 */
137 if (paritysd
) { /* we're reviving a parity block, */
138 bp
= parityrebuild(plex
, sd
->revived
, size
, rebuildparity
, &lock
, NULL
); /* do the grunt work */
139 if (bp
== NULL
) /* no buffer space */
140 return ENOMEM
; /* chicken out */
141 } else { /* data block */
142 bp
= getpbuf(&vinum_conf
.physbufs
); /* Get a buffer */
143 bp
->b_data
= Malloc(size
);
146 * Amount to transfer: block size, unless it
147 * would overlap the end.
150 bp
->b_resid
= bp
->b_bcount
;
151 bp
->b_bio1
.bio_offset
= (off_t
)plexblkno
<< DEV_BSHIFT
; /* start here */
152 bp
->b_bio1
.bio_done
= biodone_sync
;
153 bp
->b_bio1
.bio_flags
|= BIO_SYNC
;
/* The range to lock is passed as plexblkno shifted by DEV_BSHIFT. */
154 if (isstriped(plex
)) /* we need to lock striped plexes */
155 lock
= lockrange(plexblkno
<< DEV_BSHIFT
, bp
, plex
); /* lock it */
156 if (vol
!= NULL
) /* it's part of a volume, */
158 * First, read the data from the volume. We
159 * don't care which plex, that's bre's job.
162 else /* it's an unattached plex */
163 dev
= PLEX
[sd
->plexno
].plex_dev
;
165 bp
->b_cmd
= BUF_CMD_READ
;
166 vinumstart(dev
, &bp
->b_bio1
, 1);
167 biowait(&bp
->b_bio1
, "drvrd");
170 if (bp
->b_flags
& B_ERROR
)
173 /* Now write to the subdisk */
175 dev
= SD
[sdno
].sd_dev
;
176 KKASSERT(dev
!= NULL
);
177 bp
->b_flags
|= B_ORDERED
; /* and make this an ordered write */
178 bp
->b_cmd
= BUF_CMD_WRITE
;
179 bp
->b_resid
= bp
->b_bcount
;
180 bp
->b_bio1
.bio_offset
= (off_t
)sd
->revived
<< DEV_BSHIFT
; /* write it to here */
181 bp
->b_bio1
.bio_driver_info
= dev
;
182 bp
->b_bio1
.bio_done
= biodone_sync
;
183 sdio(&bp
->b_bio1
); /* perform the I/O */
184 biowait(&bp
->b_bio1
, "drvwr");
185 if (bp
->b_flags
& B_ERROR
)
188 sd
->revived
+= bp
->b_bcount
>> DEV_BSHIFT
; /* moved this much further down */
189 if (sd
->revived
>= sd
->sectors
) { /* finished */
191 set_sd_state(sdno
, sd_up
, setstate_force
); /* bring the sd up */
192 log(LOG_INFO
, "vinum: %s is %s\n", sd
->name
, sd_state(sd
->state
));
193 save_config(); /* and save the updated configuration */
194 error
= 0; /* we're done */
/* Give back the range lock when one was taken above. */
197 if (lock
) /* we took a lock, */
198 unlockrange(sd
->plexno
, lock
); /* give it back */
/*
 * Drain the wait list: launch the request at the head, then step to its
 * successor, until the list is empty.
 */
199 while (sd
->waitlist
) { /* we have waiting requests */
201 struct request
*rq
= sd
->waitlist
;
204 if (debug
& DEBUG_REVIVECONFLICT
) {
205 dev
= rq
->bio
->bio_driver_info
;
207 "Relaunch revive conflict sd %d: %p\n%s dev %d.%d, offset 0x%llx, length %d\n",
210 (rq
->bio
->bio_buf
->b_cmd
== BUF_CMD_READ
) ? "Read" : "Write",
214 rq
->bio
->bio_buf
->b_bcount
);
217 launch_requests(sd
->waitlist
, 1); /* do them now */
218 sd
->waitlist
= sd
->waitlist
->next
; /* and move on to the next */
/* Hand the buffer header back to the physbufs pool. */
222 relpbuf(bp
, &vinum_conf
.physbufs
);
227 * Check or rebuild the parity blocks of a RAID-4
230 * The variables plex->checkblock and
231 * plex->rebuildblock represent the
232 * subdisk-relative address of the stripe we're
233 * looking at, not the plex-relative address. We
234 * store it in the plex and not as a local
235 * variable because this function could be
236 * stopped, and we don't want to repeat the part
237 * we've already done. This is also the reason
238 * why we don't initialize it here except at the
239 * end. It gets initialized with the plex on
242 * Each call to this function processes at most
243 * one stripe. We can't loop in this function,
244 * because we're unstoppable, so we have to be
245 * called repeatedly from userland.
/*
 * parityops: perform one step of a parity check and/or rebuild on the
 * plex selected by data->index, starting at the offset data->offset.
 *
 * The request message is reused as the reply (struct _ioctl_reply).
 * reply->error starts as EAGAIN, meaning the caller is expected to call
 * again, and is replaced by:
 *  - EINVAL when isparity(plex) is false,
 *  - ENOMEM when parityrebuild() returns NULL,
 *  - the b_error of the parity buffer when it has B_ERROR set.
 *
 * For rebuildparity and rebuildandcheckparity the recomputed parity block
 * is written back with sdio() and biowait().  While reply->error is still
 * EAGAIN, plex->checkblock is advanced past the block just processed and
 * reset to 0 when it reaches the size of the first subdisk of the plex.
 *
 * NOTE(review): this listing has lost lines; in particular the assignment
 * of op, the error stored for an inaccessible plex and the call that
 * formats the parity error message are not visible.
 */
248 parityops(struct vinum_ioctl_msg
*data
)
252 int size
; /* I/O transfer size, bytes */
253 vinum_off_t stripe
; /* stripe number in plex */
254 int psd
; /* parity subdisk number */
255 struct rangelock
*lock
; /* lock on stripe */
256 struct _ioctl_reply
*reply
;
257 off_t pstripe
; /* pointer to our stripe counter */
259 off_t errorloc
; /* offset of parity error */
260 enum parityop op
; /* operation to perform */
262 plexno
= data
->index
;
/* The reply is written over the request message itself. */
265 reply
= (struct _ioctl_reply
*) data
;
266 reply
->error
= EAGAIN
; /* expect to repeat this call */
267 plex
= &PLEX
[plexno
];
268 if (!isparity(plex
)) { /* not RAID-4 or RAID-5 */
269 reply
->error
= EINVAL
;
/*
 * A plex in a state below plex_flaky is refused with a message.
 * NOTE(review): the error code stored for this branch is not visible.
 */
271 } else if (plex
->state
< plex_flaky
) {
273 strcpy(reply
->msg
, "Plex is not completely accessible\n");
/* Locate the stripe containing the requested offset and its parity subdisk. */
276 pstripe
= data
->offset
;
277 stripe
= pstripe
/ plex
->stripesize
; /* stripe number */
278 psd
= plex
->subdisks
- 1 - stripe
% plex
->subdisks
; /* parity subdisk for this stripe */
/* Transfer at most one default revive block, and never more than one stripe. */
279 size
= imin(DEFAULT_REVIVE_BLOCKSIZE
, /* one block at a time */
280 plex
->stripesize
<< DEV_BSHIFT
);
282 pbp
= parityrebuild(plex
, pstripe
, size
, op
, &lock
, &errorloc
); /* do the grunt work */
283 if (pbp
== NULL
) { /* no buffer space */
284 reply
->error
= ENOMEM
;
285 return; /* chicken out */
288 * Now we have a result in the data buffer of
289 * the parity buffer header, which we have kept.
290 * Decide what to do with it.
292 reply
->msg
[0] = '\0'; /* until shown otherwise */
293 if ((pbp
->b_flags
& B_ERROR
) == 0) { /* no error */
/* For either rebuild operation, turn the buffer around and write it out. */
294 if ((op
== rebuildparity
)
295 || (op
== rebuildandcheckparity
)) {
296 pbp
->b_cmd
= BUF_CMD_WRITE
;
297 pbp
->b_resid
= pbp
->b_bcount
;
298 pbp
->b_bio1
.bio_done
= biodone_sync
;
299 sdio(&pbp
->b_bio1
); /* write the parity block */
300 biowait(&pbp
->b_bio1
, "drvwr");
/*
 * For either check operation, an errorloc other than -1 means a mismatch
 * was found; the offset is reported with the format string below.
 */
302 if (((op
== checkparity
)
303 || (op
== rebuildandcheckparity
))
304 && (errorloc
!= -1)) {
305 if (op
== checkparity
)
308 "Parity incorrect at offset 0x%llx\n",
309 (long long)errorloc
);
/*
 * Still on track: record how far we got in plex->checkblock, wrapping to
 * 0 once the size of the first subdisk of the plex has been reached.
 */
311 if (reply
->error
== EAGAIN
) { /* still OK, */
312 plex
->checkblock
= pstripe
+ (pbp
->b_bcount
>> DEV_BSHIFT
); /* moved this much further down */
313 if (plex
->checkblock
>= SD
[plex
->sdnos
[0]].sectors
) { /* finished */
314 plex
->checkblock
= 0;
/* An I/O error on the parity buffer overrides whatever was stored so far. */
319 if (pbp
->b_flags
& B_ERROR
)
320 reply
->error
= pbp
->b_error
;
/* Return the parity buffer header to the pool and drop the stripe lock. */
322 relpbuf(pbp
, &vinum_conf
.physbufs
);
323 unlockrange(plexno
, lock
);
327 * Rebuild a parity stripe. Return pointer to
328 * parity bp. On return,
330 * 1. The band is locked. The caller must unlock
331 * the band and release the buffer header.
333 * 2. All buffer headers except pbp have been
334 * released. The caller must release pbp.
336 * 3. For checkparity and rebuildandcheckparity,
337 * the parity is compared with the current
338 * parity block. If it's different, the
339 * offset of the error is returned to
340 * errorloc. The caller can set the value of
341 * the pointer to NULL if this is called for
344 * pstripe is the subdisk-relative base address of
345 * the data to be reconstructed, size is the size
346 * of the transfer in bytes.
/*
 * parityrebuild: recompute the parity of one block of a parity plex by
 * XORing together the corresponding block of every non-parity subdisk.
 *
 * Names used in the body (most of the parameter list is not visible in
 * this listing):
 *   plex     - the plex to work on
 *   pstripe  - start of the block; divided by plex->stripesize to obtain
 *              the stripe number
 *   size     - requested transfer size in bytes
 *   op       - rebuildparity, checkparity or rebuildandcheckparity
 *   lockp    - receives the range lock taken with lockrange()
 *   errorloc - for the two check operations, receives -1 or the location
 *              of the first mismatch
 *
 * Visible steps:
 *  1. The transfer is clipped so that it does not cross the end of the
 *     stripe containing pstripe (mysize).
 *  2. An array of subdisks + 1 buffer pointers is allocated; the extra
 *     slot (index plex->subdisks, newpsd) holds the newly computed parity
 *     and targets the device of the parity subdisk.
 *  3. Every slot gets a buffer header and data area set up for a read,
 *     except the parity subdisk slot when op is rebuildparity.
 *  4. The result buffer is zeroed and the range lock is taken.
 *  5. Reads are issued for the real subdisks and waited for in issue
 *     order; each successfully read non-parity block is XORed into the
 *     result buffer and its buffer is released.  A read error is recorded.
 *  6. For the check operations the parity read from disk is compared
 *     with the computed parity word by word.
 *  7. A recorded error is stored in the parity buffer header via B_ERROR
 *     and b_error.
 *
 * The comment in front of this function states that the parity buffer
 * pointer is returned and that the caller must unlock the range and
 * release that buffer.
 *
 * NOTE(review): this listing has lost lines (parameters, braces, else
 * keywords and the return statement); only visible behaviour is described.
 */
349 parityrebuild(struct plex
*plex
,
353 struct rangelock
**lockp
,
358 u_int64_t stripe
; /* stripe number */
359 int *parity_buf
; /* buffer address for current parity block */
360 int *newparity_buf
; /* and for new parity block */
361 int mysize
; /* I/O transfer size for this transfer */
362 int isize
; /* mysize in ints */
364 int psd
; /* parity subdisk number */
365 int newpsd
; /* and "subdisk number" of new parity */
366 struct buf
**bpp
; /* pointers to our bps */
367 struct buf
*pbp
; /* buffer header for parity stripe */
369 int bufcount
; /* number of buffers we need */
/* Work out which stripe pstripe falls in and which subdisk holds its parity. */
371 stripe
= pstripe
/ plex
->stripesize
; /* stripe number */
372 psd
= plex
->subdisks
- 1 - stripe
% plex
->subdisks
; /* parity subdisk for this stripe */
373 parity_buf
= NULL
; /* to keep the compiler happy */
377 * It's possible that the default transfer size
378 * we chose is not a factor of the stripe size.
379 * We *must* limit this operation to a single
380 * stripe, at least for RAID-5 rebuild, since
381 * the parity subdisk changes between stripes,
382 * so in this case we need to perform a short
383 * transfer. Set variable mysize to reflect
386 mysize
= u64min(size
, (plex
->stripesize
* (stripe
+ 1) - pstripe
) << DEV_BSHIFT
);
387 isize
= mysize
/ (sizeof(int)); /* number of ints in the buffer */
388 bufcount
= plex
->subdisks
+ 1; /* sd buffers plus result buffer */
389 newpsd
= plex
->subdisks
;
390 bpp
= (struct buf
**) Malloc(bufcount
* sizeof(struct buf
*)); /* array of pointers to bps */
392 /* First, build requests for all subdisks */
393 for (sdno
= 0; sdno
< bufcount
; sdno
++) { /* for each subdisk */
394 if ((sdno
!= psd
) || (op
!= rebuildparity
)) {
395 /* Get a buffer header and initialize it. */
396 bpp
[sdno
] = getpbuf(&vinum_conf
.physbufs
); /* Get a buffer */
397 bpp
[sdno
]->b_data
= Malloc(mysize
);
399 parity_buf
= (int *) bpp
[sdno
]->b_data
;
400 if (sdno
== newpsd
) /* the new one? */
401 bpp
[sdno
]->b_bio1
.bio_driver_info
= SD
[plex
->sdnos
[psd
]].sd_dev
; /* write back to the parity SD */
403 bpp
[sdno
]->b_bio1
.bio_driver_info
= SD
[plex
->sdnos
[sdno
]].sd_dev
; /* device number */
404 KKASSERT(bpp
[sdno
]->b_bio1
.bio_driver_info
);
405 bpp
[sdno
]->b_cmd
= BUF_CMD_READ
; /* either way, read it */
406 bpp
[sdno
]->b_bcount
= mysize
;
407 bpp
[sdno
]->b_resid
= bpp
[sdno
]->b_bcount
;
408 bpp
[sdno
]->b_bio1
.bio_offset
= (off_t
)pstripe
<< DEV_BSHIFT
; /* transfer from here */
409 bpp
[sdno
]->b_bio1
.bio_done
= biodone_sync
;
413 /* Initialize result buffer */
415 newparity_buf
= (int *) bpp
[newpsd
]->b_data
;
416 bzero(newparity_buf
, mysize
);
419 * Now lock the stripe with the first non-parity
422 *lockp
= lockrange(pstripe
* plex
->stripesize
* (plex
->subdisks
- 1),
427 * Then issue requests for all subdisks in
428 * parallel. Don't transfer the parity stripe
429 * if we're rebuilding parity, unless we also
432 for (sdno
= 0; sdno
< plex
->subdisks
; sdno
++) { /* for each real subdisk */
433 if ((sdno
!= psd
) || (op
!= rebuildparity
)) {
434 sdio(&bpp
[sdno
]->b_bio1
);
439 * Next, wait for the requests to complete.
440 * We wait in the order in which they were
441 * issued, which isn't necessarily the order in
442 * which they complete, but we don't have a
443 * convenient way of doing the latter, and the
446 for (sdno
= 0; sdno
< plex
->subdisks
; sdno
++) { /* for each subdisk */
447 if ((sdno
!= psd
) || (op
!= rebuildparity
)) {
448 biowait(&bpp
[sdno
]->b_bio1
, "drvio");
449 if (bpp
[sdno
]->b_flags
& B_ERROR
) /* can't read, */
450 error
= bpp
[sdno
]->b_error
;
451 else if (sdno
!= psd
) { /* update parity */
452 sbuf
= (int *) bpp
[sdno
]->b_data
;
453 for (i
= 0; i
< isize
; i
++)
454 newparity_buf
[i
] ^= sbuf
[i
]; /* xor in the buffer */
457 if (sdno
!= psd
) { /* release all bps except parity */
458 Free(bpp
[sdno
]->b_data
);
459 relpbuf(bpp
[sdno
], &vinum_conf
.physbufs
); /* give back our resources */
464 * If we're checking, compare the calculated
465 * and the read parity block. If they're
466 * different, return the plex-relative offset;
467 * otherwise return -1.
469 if ((op
== checkparity
)
470 || (op
== rebuildandcheckparity
)) {
471 *errorloc
= -1; /* no error yet */
472 for (i
= 0; i
< isize
; i
++) {
473 if (parity_buf
[i
] != newparity_buf
[i
]) {
474 *errorloc
= (off_t
) (pstripe
<< DEV_BSHIFT
) * (plex
->subdisks
- 1)
479 Free(bpp
[psd
]->b_data
);
480 relpbuf(bpp
[psd
], &vinum_conf
.physbufs
); /* give back our resources */
482 /* release our resources */
/* Pass a recorded read error on to the caller through the parity buffer header. */
485 pbp
->b_flags
|= B_ERROR
;
486 pbp
->b_error
= error
;
492 * Initialize a subdisk by writing zeroes to the
493 * complete address space. If verify is set,
494 * check each transfer for correctness.
496 * Each call to this function writes (and maybe
497 * checks) a single block.
500 initsd(int sdno
, int verify
)
507 int size
; /* size of init block, bytes */
508 vinum_off_t plexblkno
; /* lblkno in plex */
509 int verified
; /* set when we're happy with what we wrote */
512 plexblkno
= 0; /* to keep the compiler happy */
514 if (sd
->plexno
< 0) /* no plex? */
516 plex
= &PLEX
[sd
->plexno
]; /* point to plex */
517 if (plex
->volno
>= 0)
518 vol
= &VOL
[plex
->volno
];
522 if (sd
->init_blocksize
== 0) {
523 if (plex
->stripesize
!= 0) /* we're striped, don't init more than */
524 sd
->init_blocksize
= u64min(DEFAULT_REVIVE_BLOCKSIZE
, /* one block at a time */
525 plex
->stripesize
<< DEV_BSHIFT
);
527 sd
->init_blocksize
= DEFAULT_REVIVE_BLOCKSIZE
;
528 } else if (sd
->init_blocksize
> MAX_REVIVE_BLOCKSIZE
)
529 sd
->init_blocksize
= MAX_REVIVE_BLOCKSIZE
;
531 size
= u64min(sd
->init_blocksize
>> DEV_BSHIFT
, sd
->sectors
- sd
->initialized
) << DEV_BSHIFT
;
533 bp
= getpbuf(&vinum_conf
.physbufs
); /* Get a buffer */
534 bp
->b_data
= Malloc(size
);
537 while (!verified
) { /* until we're happy with it, */
539 bp
->b_resid
= bp
->b_bcount
;
540 bp
->b_bio1
.bio_offset
= (off_t
)sd
->initialized
<< DEV_BSHIFT
; /* write it to here */
541 bp
->b_bio1
.bio_driver_info
= SD
[sdno
].sd_dev
;
542 bp
->b_bio1
.bio_done
= biodone_sync
;
543 KKASSERT(bp
->b_bio1
.bio_driver_info
);
544 bzero(bp
->b_data
, bp
->b_bcount
);
545 bp
->b_cmd
= BUF_CMD_WRITE
;
546 sdio(&bp
->b_bio1
); /* perform the I/O */
547 biowait(&bp
->b_bio1
, "drvwr");
548 if (bp
->b_flags
& B_ERROR
)
550 if ((error
== 0) && verify
) { /* check that it got there */
552 bp
->b_resid
= bp
->b_bcount
;
553 bp
->b_bio1
.bio_offset
= (off_t
)sd
->initialized
<< DEV_BSHIFT
; /* read from here */
554 bp
->b_bio1
.bio_driver_info
= SD
[sdno
].sd_dev
;
555 bp
->b_bio1
.bio_done
= biodone_sync
;
556 KKASSERT(bp
->b_bio1
.bio_driver_info
);
557 bp
->b_cmd
= BUF_CMD_READ
; /* read it back */
559 biowait(&bp
->b_bio1
, "drvrd");
561 * XXX Bug fix code. This is hopefully no
562 * longer needed (21 February 2000).
564 if (bp
->b_flags
& B_ERROR
)
566 else if ((*bp
->b_data
!= 0) /* first word spammed */
567 ||(bcmp(bp
->b_data
, &bp
->b_data
[1], bp
->b_bcount
- 1))) { /* or one of the others */
568 kprintf("vinum: init error on %s, offset 0x%llx sectors\n",
570 (long long) sd
->initialized
);
578 relpbuf(bp
, &vinum_conf
.physbufs
);
579 if (error
== 0) { /* did it, */
580 sd
->initialized
+= size
>> DEV_BSHIFT
; /* moved this much further down */
581 if (sd
->initialized
>= sd
->sectors
) { /* finished */
583 set_sd_state(sdno
, sd_initialized
, setstate_force
); /* bring the sd up */
584 log(LOG_INFO
, "vinum: %s is %s\n", sd
->name
, sd_state(sd
->state
));
585 save_config(); /* and save the updated configuration */
586 } else /* more to go, */
587 error
= EAGAIN
; /* ya'll come back, see? */