Remove some emacs variable settings.
[dragonfly.git] / sys / dev / raid / vinum / vinumrevive.c
blobce5f7839272bea8d6b81dee339c7afd3f4a9bb81
1 /*-
2 * Copyright (c) 1997, 1998, 1999
3 * Nan Yang Computer Services Limited. All rights reserved.
5 * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
7 * Written by Greg Lehey
9 * This software is distributed under the so-called ``Berkeley
10 * License'':
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. All advertising materials mentioning features or use of this software
21 * must display the following acknowledgement:
22 * This product includes software developed by Nan Yang Computer
23 * Services Limited.
24 * 4. Neither the name of the Company nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
28 * This software is provided ``as is'', and any express or implied
29 * warranties, including, but not limited to, the implied warranties of
30 * merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall the company or contributors be liable for any
32 * direct, indirect, incidental, special, exemplary, or consequential
33 * damages (including, but not limited to, procurement of substitute
34 * goods or services; loss of use, data, or profits; or business
35 * interruption) however caused and on any theory of liability, whether
36 * in contract, strict liability, or tort (including negligence or
37 * otherwise) arising in any way out of the use of this software, even if
38 * advised of the possibility of such damage.
40 * $Id: vinumrevive.c,v 1.14 2000/12/21 01:55:11 grog Exp grog $
41 * $FreeBSD: src/sys/dev/vinum/vinumrevive.c,v 1.22.2.5 2001/03/13 02:59:43 grog Exp $
44 #include "vinumhdr.h"
45 #include "request.h"
48 * Revive a block of a subdisk. Return an error
49 * indication. EAGAIN means successful copy, but
50 * that more blocks remain to be copied. EINVAL
51 * means that the subdisk isn't associated with a
52 * plex (which means a programming error if we get
53 * here at all; FIXME).
56 int
57 revive_block(int sdno)
59 struct sd *sd;
60 struct plex *plex;
61 struct volume *vol;
62 struct buf *bp;
63 cdev_t dev;
64 int error = EAGAIN;
65 int size; /* size of revive block, bytes */
66 vinum_off_t plexblkno; /* lblkno in plex */
67 int psd; /* parity subdisk number */
68 u_int64_t stripe; /* stripe number */
69 int paritysd = 0; /* set if this is the parity stripe */
70 struct rangelock *lock; /* for locking */
71 vinum_off_t stripeoffset; /* offset in stripe */
73 plexblkno = 0; /* to keep the compiler happy */
74 sd = &SD[sdno];
75 lock = NULL;
76 if (sd->plexno < 0) /* no plex? */
77 return EINVAL;
78 plex = &PLEX[sd->plexno]; /* point to plex */
79 if (plex->volno >= 0)
80 vol = &VOL[plex->volno];
81 else
82 vol = NULL;
84 if ((sd->revive_blocksize == 0) /* no block size */
85 ||(sd->revive_blocksize & ((1 << DEV_BSHIFT) - 1))) /* or invalid block size */
86 sd->revive_blocksize = DEFAULT_REVIVE_BLOCKSIZE;
87 else if (sd->revive_blocksize > MAX_REVIVE_BLOCKSIZE)
88 sd->revive_blocksize = MAX_REVIVE_BLOCKSIZE;
89 size = u64min(sd->revive_blocksize >> DEV_BSHIFT, sd->sectors - sd->revived) << DEV_BSHIFT;
90 sd->reviver = curproc->p_pid; /* note who last had a bash at it */
92 /* Now decide where to read from */
93 switch (plex->organization) {
94 case plex_concat:
95 plexblkno = sd->revived + sd->plexoffset; /* corresponding address in plex */
96 break;
98 case plex_striped:
99 stripeoffset = sd->revived % plex->stripesize; /* offset from beginning of stripe */
100 if (stripeoffset + (size >> DEV_BSHIFT) > plex->stripesize)
101 size = (plex->stripesize - stripeoffset) << DEV_BSHIFT;
102 plexblkno = sd->plexoffset /* base */
103 + (sd->revived - stripeoffset) * plex->subdisks /* offset to beginning of stripe */
104 + stripeoffset; /* offset from beginning of stripe */
105 break;
107 case plex_raid4:
108 case plex_raid5:
109 stripeoffset = sd->revived % plex->stripesize; /* offset from beginning of stripe */
110 plexblkno = sd->plexoffset /* base */
111 + (sd->revived - stripeoffset) * (plex->subdisks - 1) /* offset to beginning of stripe */
112 +stripeoffset; /* offset from beginning of stripe */
113 stripe = (sd->revived / plex->stripesize); /* stripe number */
115 /* Make sure we don't go beyond the end of the band. */
116 size = u64min(size, (plex->stripesize - stripeoffset) << DEV_BSHIFT);
117 if (plex->organization == plex_raid4)
118 psd = plex->subdisks - 1; /* parity subdisk for this stripe */
119 else
120 psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe */
121 paritysd = plex->sdnos[psd] == sdno; /* note if it's the parity subdisk */
124 * Now adjust for the strangenesses
125 * in RAID-4 and RAID-5 striping.
127 if (sd->plexsdno > psd) /* beyond the parity stripe, */
128 plexblkno -= plex->stripesize; /* one stripe less */
129 else if (paritysd)
130 plexblkno -= plex->stripesize * sd->plexsdno; /* go back to the beginning of the band */
131 break;
133 case plex_disorg: /* to keep the compiler happy */
134 break;
137 if (paritysd) { /* we're reviving a parity block, */
138 bp = parityrebuild(plex, sd->revived, size, rebuildparity, &lock, NULL); /* do the grunt work */
139 if (bp == NULL) /* no buffer space */
140 return ENOMEM; /* chicken out */
141 } else { /* data block */
142 bp = getpbuf(&vinum_conf.physbufs); /* Get a buffer */
143 bp->b_data = Malloc(size);
146 * Amount to transfer: block size, unless it
147 * would overlap the end.
149 bp->b_bcount = size;
150 bp->b_resid = bp->b_bcount;
151 bp->b_bio1.bio_offset = (off_t)plexblkno << DEV_BSHIFT; /* start here */
152 bp->b_bio1.bio_done = biodone_sync;
153 bp->b_bio1.bio_flags |= BIO_SYNC;
154 if (isstriped(plex)) /* we need to lock striped plexes */
155 lock = lockrange(plexblkno << DEV_BSHIFT, bp, plex); /* lock it */
156 if (vol != NULL) /* it's part of a volume, */
158 * First, read the data from the volume. We
159 * don't care which plex, that's bre's job.
161 dev = vol->vol_dev;
162 else /* it's an unattached plex */
163 dev = PLEX[sd->plexno].plex_dev;
165 bp->b_cmd = BUF_CMD_READ;
166 vinumstart(dev, &bp->b_bio1, 1);
167 biowait(&bp->b_bio1, "drvrd");
170 if (bp->b_flags & B_ERROR)
171 error = bp->b_error;
172 else
173 /* Now write to the subdisk */
175 dev = SD[sdno].sd_dev;
176 KKASSERT(dev != NULL);
177 bp->b_flags |= B_ORDERED; /* and make this an ordered write */
178 bp->b_cmd = BUF_CMD_WRITE;
179 bp->b_resid = bp->b_bcount;
180 bp->b_bio1.bio_offset = (off_t)sd->revived << DEV_BSHIFT; /* write it to here */
181 bp->b_bio1.bio_driver_info = dev;
182 bp->b_bio1.bio_done = biodone_sync;
183 sdio(&bp->b_bio1); /* perform the I/O */
184 biowait(&bp->b_bio1, "drvwr");
185 if (bp->b_flags & B_ERROR)
186 error = bp->b_error;
187 else {
188 sd->revived += bp->b_bcount >> DEV_BSHIFT; /* moved this much further down */
189 if (sd->revived >= sd->sectors) { /* finished */
190 sd->revived = 0;
191 set_sd_state(sdno, sd_up, setstate_force); /* bring the sd up */
192 log(LOG_INFO, "vinum: %s is %s\n", sd->name, sd_state(sd->state));
193 save_config(); /* and save the updated configuration */
194 error = 0; /* we're done */
197 if (lock) /* we took a lock, */
198 unlockrange(sd->plexno, lock); /* give it back */
199 while (sd->waitlist) { /* we have waiting requests */
200 #if VINUMDEBUG
201 struct request *rq = sd->waitlist;
202 cdev_t dev;
204 if (debug & DEBUG_REVIVECONFLICT) {
205 dev = rq->bio->bio_driver_info;
206 log(LOG_DEBUG,
207 "Relaunch revive conflict sd %d: %p\n%s dev %d.%d, offset 0x%llx, length %d\n",
208 rq->sdno,
210 (rq->bio->bio_buf->b_cmd == BUF_CMD_READ) ? "Read" : "Write",
211 major(dev),
212 minor(dev),
213 rq->bio->bio_offset,
214 rq->bio->bio_buf->b_bcount);
216 #endif
217 launch_requests(sd->waitlist, 1); /* do them now */
218 sd->waitlist = sd->waitlist->next; /* and move on to the next */
221 Free(bp->b_data);
222 relpbuf(bp, &vinum_conf.physbufs);
223 return error;
227 * Check or rebuild the parity blocks of a RAID-4
228 * or RAID-5 plex.
230 * The variables plex->checkblock and
231 * plex->rebuildblock represent the
232 * subdisk-relative address of the stripe we're
233 * looking at, not the plex-relative address. We
234 * store it in the plex and not as a local
235 * variable because this function could be
236 * stopped, and we don't want to repeat the part
237 * we've already done. This is also the reason
238 * why we don't initialize it here except at the
239 * end. It gets initialized with the plex on
240 * creation.
242 * Each call to this function processes at most
243 * one stripe. We can't loop in this function,
244 * because we're unstoppable, so we have to be
245 * called repeatedly from userland.
247 void
248 parityops(struct vinum_ioctl_msg *data)
250 int plexno;
251 struct plex *plex;
252 int size; /* I/O transfer size, bytes */
253 vinum_off_t stripe; /* stripe number in plex */
254 int psd; /* parity subdisk number */
255 struct rangelock *lock; /* lock on stripe */
256 struct _ioctl_reply *reply;
257 off_t pstripe; /* pointer to our stripe counter */
258 struct buf *pbp;
259 off_t errorloc; /* offset of parity error */
260 enum parityop op; /* operation to perform */
262 plexno = data->index;
263 op = data->op;
264 pbp = NULL;
265 reply = (struct _ioctl_reply *) data;
266 reply->error = EAGAIN; /* expect to repeat this call */
267 plex = &PLEX[plexno];
268 if (!isparity(plex)) { /* not RAID-4 or RAID-5 */
269 reply->error = EINVAL;
270 return;
271 } else if (plex->state < plex_flaky) {
272 reply->error = EIO;
273 strcpy(reply->msg, "Plex is not completely accessible\n");
274 return;
276 pstripe = data->offset;
277 stripe = pstripe / plex->stripesize; /* stripe number */
278 psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe */
279 size = imin(DEFAULT_REVIVE_BLOCKSIZE, /* one block at a time */
280 plex->stripesize << DEV_BSHIFT);
282 pbp = parityrebuild(plex, pstripe, size, op, &lock, &errorloc); /* do the grunt work */
283 if (pbp == NULL) { /* no buffer space */
284 reply->error = ENOMEM;
285 return; /* chicken out */
288 * Now we have a result in the data buffer of
289 * the parity buffer header, which we have kept.
290 * Decide what to do with it.
292 reply->msg[0] = '\0'; /* until shown otherwise */
293 if ((pbp->b_flags & B_ERROR) == 0) { /* no error */
294 if ((op == rebuildparity)
295 || (op == rebuildandcheckparity)) {
296 pbp->b_cmd = BUF_CMD_WRITE;
297 pbp->b_resid = pbp->b_bcount;
298 pbp->b_bio1.bio_done = biodone_sync;
299 sdio(&pbp->b_bio1); /* write the parity block */
300 biowait(&pbp->b_bio1, "drvwr");
302 if (((op == checkparity)
303 || (op == rebuildandcheckparity))
304 && (errorloc != -1)) {
305 if (op == checkparity)
306 reply->error = EIO;
307 ksprintf(reply->msg,
308 "Parity incorrect at offset 0x%llx\n",
309 (long long)errorloc);
311 if (reply->error == EAGAIN) { /* still OK, */
312 plex->checkblock = pstripe + (pbp->b_bcount >> DEV_BSHIFT); /* moved this much further down */
313 if (plex->checkblock >= SD[plex->sdnos[0]].sectors) { /* finished */
314 plex->checkblock = 0;
315 reply->error = 0;
319 if (pbp->b_flags & B_ERROR)
320 reply->error = pbp->b_error;
321 Free(pbp->b_data);
322 relpbuf(pbp, &vinum_conf.physbufs);
323 unlockrange(plexno, lock);
327 * Rebuild a parity stripe. Return pointer to
328 * parity bp. On return,
330 * 1. The band is locked. The caller must unlock
331 * the band and release the buffer header.
333 * 2. All buffer headers except php have been
334 * released. The caller must release pbp.
336 * 3. For checkparity and rebuildandcheckparity,
337 * the parity is compared with the current
338 * parity block. If it's different, the
339 * offset of the error is returned to
340 * errorloc. The caller can set the value of
341 * the pointer to NULL if this is called for
342 * rebuilding parity.
344 * pstripe is the subdisk-relative base address of
345 * the data to be reconstructed, size is the size
346 * of the transfer in bytes.
348 struct buf *
349 parityrebuild(struct plex *plex,
350 vinum_off_t pstripe,
351 int size,
352 enum parityop op,
353 struct rangelock **lockp,
354 off_t * errorloc)
356 int error;
357 int sdno;
358 u_int64_t stripe; /* stripe number */
359 int *parity_buf; /* buffer address for current parity block */
360 int *newparity_buf; /* and for new parity block */
361 int mysize; /* I/O transfer size for this transfer */
362 int isize; /* mysize in ints */
363 int i;
364 int psd; /* parity subdisk number */
365 int newpsd; /* and "subdisk number" of new parity */
366 struct buf **bpp; /* pointers to our bps */
367 struct buf *pbp; /* buffer header for parity stripe */
368 int *sbuf;
369 int bufcount; /* number of buffers we need */
371 stripe = pstripe / plex->stripesize; /* stripe number */
372 psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe */
373 parity_buf = NULL; /* to keep the compiler happy */
374 error = 0;
377 * It's possible that the default transfer size
378 * we chose is not a factor of the stripe size.
379 * We *must* limit this operation to a single
380 * stripe, at least for RAID-5 rebuild, since
381 * the parity subdisk changes between stripes,
382 * so in this case we need to perform a short
383 * transfer. Set variable mysize to reflect
384 * this.
386 mysize = u64min(size, (plex->stripesize * (stripe + 1) - pstripe) << DEV_BSHIFT);
387 isize = mysize / (sizeof(int)); /* number of ints in the buffer */
388 bufcount = plex->subdisks + 1; /* sd buffers plus result buffer */
389 newpsd = plex->subdisks;
390 bpp = (struct buf **) Malloc(bufcount * sizeof(struct buf *)); /* array of pointers to bps */
392 /* First, build requests for all subdisks */
393 for (sdno = 0; sdno < bufcount; sdno++) { /* for each subdisk */
394 if ((sdno != psd) || (op != rebuildparity)) {
395 /* Get a buffer header and initialize it. */
396 bpp[sdno] = getpbuf(&vinum_conf.physbufs); /* Get a buffer */
397 bpp[sdno]->b_data = Malloc(mysize);
398 if (sdno == psd)
399 parity_buf = (int *) bpp[sdno]->b_data;
400 if (sdno == newpsd) /* the new one? */
401 bpp[sdno]->b_bio1.bio_driver_info = SD[plex->sdnos[psd]].sd_dev; /* write back to the parity SD */
402 else
403 bpp[sdno]->b_bio1.bio_driver_info = SD[plex->sdnos[sdno]].sd_dev; /* device number */
404 KKASSERT(bpp[sdno]->b_bio1.bio_driver_info);
405 bpp[sdno]->b_cmd = BUF_CMD_READ; /* either way, read it */
406 bpp[sdno]->b_bcount = mysize;
407 bpp[sdno]->b_resid = bpp[sdno]->b_bcount;
408 bpp[sdno]->b_bio1.bio_offset = (off_t)pstripe << DEV_BSHIFT; /* transfer from here */
409 bpp[sdno]->b_bio1.bio_done = biodone_sync;
413 /* Initialize result buffer */
414 pbp = bpp[newpsd];
415 newparity_buf = (int *) bpp[newpsd]->b_data;
416 bzero(newparity_buf, mysize);
419 * Now lock the stripe with the first non-parity
420 * bp as locking bp.
422 *lockp = lockrange(pstripe * plex->stripesize * (plex->subdisks - 1),
423 bpp[psd ? 0 : 1],
424 plex);
427 * Then issue requests for all subdisks in
428 * parallel. Don't transfer the parity stripe
429 * if we're rebuilding parity, unless we also
430 * want to check it.
432 for (sdno = 0; sdno < plex->subdisks; sdno++) { /* for each real subdisk */
433 if ((sdno != psd) || (op != rebuildparity)) {
434 sdio(&bpp[sdno]->b_bio1);
439 * Next, wait for the requests to complete.
440 * We wait in the order in which they were
441 * issued, which isn't necessarily the order in
442 * which they complete, but we don't have a
443 * convenient way of doing the latter, and the
444 * delay is minimal.
446 for (sdno = 0; sdno < plex->subdisks; sdno++) { /* for each subdisk */
447 if ((sdno != psd) || (op != rebuildparity)) {
448 biowait(&bpp[sdno]->b_bio1, "drvio");
449 if (bpp[sdno]->b_flags & B_ERROR) /* can't read, */
450 error = bpp[sdno]->b_error;
451 else if (sdno != psd) { /* update parity */
452 sbuf = (int *) bpp[sdno]->b_data;
453 for (i = 0; i < isize; i++)
454 newparity_buf[i] ^= sbuf[i]; /* xor in the buffer */
457 if (sdno != psd) { /* release all bps except parity */
458 Free(bpp[sdno]->b_data);
459 relpbuf(bpp[sdno], &vinum_conf.physbufs); /* give back our resources */
464 * If we're checking, compare the calculated
465 * and the read parity block. If they're
466 * different, return the plex-relative offset;
467 * otherwise return -1.
469 if ((op == checkparity)
470 || (op == rebuildandcheckparity)) {
471 *errorloc = -1; /* no error yet */
472 for (i = 0; i < isize; i++) {
473 if (parity_buf[i] != newparity_buf[i]) {
474 *errorloc = (off_t) (pstripe << DEV_BSHIFT) * (plex->subdisks - 1)
475 + i * sizeof(int);
476 break;
479 Free(bpp[psd]->b_data);
480 relpbuf(bpp[psd], &vinum_conf.physbufs); /* give back our resources */
482 /* release our resources */
483 Free(bpp);
484 if (error) {
485 pbp->b_flags |= B_ERROR;
486 pbp->b_error = error;
488 return pbp;
492 * Initialize a subdisk by writing zeroes to the
493 * complete address space. If verify is set,
494 * check each transfer for correctness.
496 * Each call to this function writes (and maybe
497 * checks) a single block.
500 initsd(int sdno, int verify)
502 struct sd *sd;
503 struct plex *plex;
504 struct volume *vol;
505 struct buf *bp;
506 int error;
507 int size; /* size of init block, bytes */
508 vinum_off_t plexblkno; /* lblkno in plex */
509 int verified; /* set when we're happy with what we wrote */
511 error = 0;
512 plexblkno = 0; /* to keep the compiler happy */
513 sd = &SD[sdno];
514 if (sd->plexno < 0) /* no plex? */
515 return EINVAL;
516 plex = &PLEX[sd->plexno]; /* point to plex */
517 if (plex->volno >= 0)
518 vol = &VOL[plex->volno];
519 else
520 vol = NULL;
522 if (sd->init_blocksize == 0) {
523 if (plex->stripesize != 0) /* we're striped, don't init more than */
524 sd->init_blocksize = u64min(DEFAULT_REVIVE_BLOCKSIZE, /* one block at a time */
525 plex->stripesize << DEV_BSHIFT);
526 else
527 sd->init_blocksize = DEFAULT_REVIVE_BLOCKSIZE;
528 } else if (sd->init_blocksize > MAX_REVIVE_BLOCKSIZE)
529 sd->init_blocksize = MAX_REVIVE_BLOCKSIZE;
531 size = u64min(sd->init_blocksize >> DEV_BSHIFT, sd->sectors - sd->initialized) << DEV_BSHIFT;
533 bp = getpbuf(&vinum_conf.physbufs); /* Get a buffer */
534 bp->b_data = Malloc(size);
536 verified = 0;
537 while (!verified) { /* until we're happy with it, */
538 bp->b_bcount = size;
539 bp->b_resid = bp->b_bcount;
540 bp->b_bio1.bio_offset = (off_t)sd->initialized << DEV_BSHIFT; /* write it to here */
541 bp->b_bio1.bio_driver_info = SD[sdno].sd_dev;
542 bp->b_bio1.bio_done = biodone_sync;
543 KKASSERT(bp->b_bio1.bio_driver_info);
544 bzero(bp->b_data, bp->b_bcount);
545 bp->b_cmd = BUF_CMD_WRITE;
546 sdio(&bp->b_bio1); /* perform the I/O */
547 biowait(&bp->b_bio1, "drvwr");
548 if (bp->b_flags & B_ERROR)
549 error = bp->b_error;
550 if ((error == 0) && verify) { /* check that it got there */
551 bp->b_bcount = size;
552 bp->b_resid = bp->b_bcount;
553 bp->b_bio1.bio_offset = (off_t)sd->initialized << DEV_BSHIFT; /* read from here */
554 bp->b_bio1.bio_driver_info = SD[sdno].sd_dev;
555 bp->b_bio1.bio_done = biodone_sync;
556 KKASSERT(bp->b_bio1.bio_driver_info);
557 bp->b_cmd = BUF_CMD_READ; /* read it back */
558 sdio(&bp->b_bio1);
559 biowait(&bp->b_bio1, "drvrd");
561 * XXX Bug fix code. This is hopefully no
562 * longer needed (21 February 2000).
564 if (bp->b_flags & B_ERROR)
565 error = bp->b_error;
566 else if ((*bp->b_data != 0) /* first word spammed */
567 ||(bcmp(bp->b_data, &bp->b_data[1], bp->b_bcount - 1))) { /* or one of the others */
568 kprintf("vinum: init error on %s, offset 0x%llx sectors\n",
569 sd->name,
570 (long long) sd->initialized);
571 verified = 0;
572 } else
573 verified = 1;
574 } else
575 verified = 1;
577 Free(bp->b_data);
578 relpbuf(bp, &vinum_conf.physbufs);
579 if (error == 0) { /* did it, */
580 sd->initialized += size >> DEV_BSHIFT; /* moved this much further down */
581 if (sd->initialized >= sd->sectors) { /* finished */
582 sd->initialized = 0;
583 set_sd_state(sdno, sd_initialized, setstate_force); /* bring the sd up */
584 log(LOG_INFO, "vinum: %s is %s\n", sd->name, sd_state(sd->state));
585 save_config(); /* and save the updated configuration */
586 } else /* more to go, */
587 error = EAGAIN; /* ya'll come back, see? */
589 return error;