2 * Copyright (c) 1997, 1998, 1999
3 * Nan Yang Computer Services Limited. All rights reserved.
5 * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
7 * Written by Greg Lehey
9 * This software is distributed under the so-called ``Berkeley
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. All advertising materials mentioning features or use of this software
21 * must display the following acknowledgement:
22 * This product includes software developed by Nan Yang Computer
24 * 4. Neither the name of the Company nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
28 * This software is provided ``as is'', and any express or implied
29 * warranties, including, but not limited to, the implied warranties of
30 * merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall the company or contributors be liable for any
32 * direct, indirect, incidental, special, exemplary, or consequential
33 * damages (including, but not limited to, procurement of substitute
34 * goods or services; loss of use, data, or profits; or business
35 * interruption) however caused and on any theory of liability, whether
36 * in contract, strict liability, or tort (including negligence or
37 * otherwise) arising in any way out of the use of this software, even if
38 * advised of the possibility of such damage.
40 * $Id: vinumrevive.c,v 1.14 2000/12/21 01:55:11 grog Exp grog $
41 * $FreeBSD: src/sys/dev/vinum/vinumrevive.c,v 1.22.2.5 2001/03/13 02:59:43 grog Exp $
42 * $DragonFly: src/sys/dev/raid/vinum/vinumrevive.c,v 1.15 2006/12/22 23:26:24 swildner Exp $
49 * Revive a block of a subdisk. Return an error
50 * indication. EAGAIN means successful copy, but
51 * that more blocks remain to be copied. EINVAL
52 * means that the subdisk isn't associated with a
53 * plex (which means a programming error if we get
54 * here at all; FIXME).
58 revive_block(int sdno
)
66 int size
; /* size of revive block, bytes */
67 vinum_off_t plexblkno
; /* lblkno in plex */
68 int psd
; /* parity subdisk number */
69 u_int64_t stripe
; /* stripe number */
70 int paritysd
= 0; /* set if this is the parity stripe */
71 struct rangelock
*lock
; /* for locking */
72 vinum_off_t stripeoffset
; /* offset in stripe */
74 plexblkno
= 0; /* to keep the compiler happy */
77 if (sd
->plexno
< 0) /* no plex? */
79 plex
= &PLEX
[sd
->plexno
]; /* point to plex */
81 vol
= &VOL
[plex
->volno
];
85 if ((sd
->revive_blocksize
== 0) /* no block size */
86 ||(sd
->revive_blocksize
& ((1 << DEV_BSHIFT
) - 1))) /* or invalid block size */
87 sd
->revive_blocksize
= DEFAULT_REVIVE_BLOCKSIZE
;
88 else if (sd
->revive_blocksize
> MAX_REVIVE_BLOCKSIZE
)
89 sd
->revive_blocksize
= MAX_REVIVE_BLOCKSIZE
;
90 size
= u64min(sd
->revive_blocksize
>> DEV_BSHIFT
, sd
->sectors
- sd
->revived
) << DEV_BSHIFT
;
91 sd
->reviver
= curproc
->p_pid
; /* note who last had a bash at it */
93 /* Now decide where to read from */
94 switch (plex
->organization
) {
96 plexblkno
= sd
->revived
+ sd
->plexoffset
; /* corresponding address in plex */
100 stripeoffset
= sd
->revived
% plex
->stripesize
; /* offset from beginning of stripe */
101 if (stripeoffset
+ (size
>> DEV_BSHIFT
) > plex
->stripesize
)
102 size
= (plex
->stripesize
- stripeoffset
) << DEV_BSHIFT
;
103 plexblkno
= sd
->plexoffset
/* base */
104 + (sd
->revived
- stripeoffset
) * plex
->subdisks
/* offset to beginning of stripe */
105 + stripeoffset
; /* offset from beginning of stripe */
110 stripeoffset
= sd
->revived
% plex
->stripesize
; /* offset from beginning of stripe */
111 plexblkno
= sd
->plexoffset
/* base */
112 + (sd
->revived
- stripeoffset
) * (plex
->subdisks
- 1) /* offset to beginning of stripe */
113 +stripeoffset
; /* offset from beginning of stripe */
114 stripe
= (sd
->revived
/ plex
->stripesize
); /* stripe number */
116 /* Make sure we don't go beyond the end of the band. */
117 size
= u64min(size
, (plex
->stripesize
- stripeoffset
) << DEV_BSHIFT
);
118 if (plex
->organization
== plex_raid4
)
119 psd
= plex
->subdisks
- 1; /* parity subdisk for this stripe */
121 psd
= plex
->subdisks
- 1 - stripe
% plex
->subdisks
; /* parity subdisk for this stripe */
122 paritysd
= plex
->sdnos
[psd
] == sdno
; /* note if it's the parity subdisk */
125 * Now adjust for the strangenesses
126 * in RAID-4 and RAID-5 striping.
128 if (sd
->plexsdno
> psd
) /* beyond the parity stripe, */
129 plexblkno
-= plex
->stripesize
; /* one stripe less */
131 plexblkno
-= plex
->stripesize
* sd
->plexsdno
; /* go back to the beginning of the band */
134 case plex_disorg
: /* to keep the compiler happy */
138 if (paritysd
) { /* we're reviving a parity block, */
139 bp
= parityrebuild(plex
, sd
->revived
, size
, rebuildparity
, &lock
, NULL
); /* do the grunt work */
140 if (bp
== NULL
) /* no buffer space */
141 return ENOMEM
; /* chicken out */
142 } else { /* data block */
143 bp
= getpbuf(&vinum_conf
.physbufs
); /* Get a buffer */
144 bp
->b_data
= Malloc(size
);
147 * Amount to transfer: block size, unless it
148 * would overlap the end.
151 bp
->b_resid
= bp
->b_bcount
;
152 bp
->b_bio1
.bio_offset
= (off_t
)plexblkno
<< DEV_BSHIFT
; /* start here */
153 bp
->b_bio1
.bio_done
= biodone_sync
;
154 bp
->b_bio1
.bio_flags
|= BIO_SYNC
;
155 if (isstriped(plex
)) /* we need to lock striped plexes */
156 lock
= lockrange(plexblkno
<< DEV_BSHIFT
, bp
, plex
); /* lock it */
157 if (vol
!= NULL
) /* it's part of a volume, */
159 * First, read the data from the volume. We
160 * don't care which plex, that's bre's job.
163 else /* it's an unattached plex */
164 dev
= PLEX
[sd
->plexno
].plex_dev
;
166 bp
->b_cmd
= BUF_CMD_READ
;
167 vinumstart(dev
, &bp
->b_bio1
, 1);
168 biowait(&bp
->b_bio1
, "drvrd");
171 if (bp
->b_flags
& B_ERROR
)
174 /* Now write to the subdisk */
176 dev
= SD
[sdno
].sd_dev
;
177 KKASSERT(dev
!= NULL
);
178 bp
->b_flags
|= B_ORDERED
; /* and make this an ordered write */
179 bp
->b_cmd
= BUF_CMD_WRITE
;
180 bp
->b_resid
= bp
->b_bcount
;
181 bp
->b_bio1
.bio_offset
= (off_t
)sd
->revived
<< DEV_BSHIFT
; /* write it to here */
182 bp
->b_bio1
.bio_driver_info
= dev
;
183 bp
->b_bio1
.bio_done
= biodone_sync
;
184 sdio(&bp
->b_bio1
); /* perform the I/O */
185 biowait(&bp
->b_bio1
, "drvwr");
186 if (bp
->b_flags
& B_ERROR
)
189 sd
->revived
+= bp
->b_bcount
>> DEV_BSHIFT
; /* moved this much further down */
190 if (sd
->revived
>= sd
->sectors
) { /* finished */
192 set_sd_state(sdno
, sd_up
, setstate_force
); /* bring the sd up */
193 log(LOG_INFO
, "vinum: %s is %s\n", sd
->name
, sd_state(sd
->state
));
194 save_config(); /* and save the updated configuration */
195 error
= 0; /* we're done */
198 if (lock
) /* we took a lock, */
199 unlockrange(sd
->plexno
, lock
); /* give it back */
200 while (sd
->waitlist
) { /* we have waiting requests */
202 struct request
*rq
= sd
->waitlist
;
205 if (debug
& DEBUG_REVIVECONFLICT
) {
206 dev
= rq
->bio
->bio_driver_info
;
208 "Relaunch revive conflict sd %d: %p\n%s dev %d.%d, offset 0x%llx, length %d\n",
211 (rq
->bio
->bio_buf
->b_cmd
== BUF_CMD_READ
) ? "Read" : "Write",
215 rq
->bio
->bio_buf
->b_bcount
);
218 launch_requests(sd
->waitlist
, 1); /* do them now */
219 sd
->waitlist
= sd
->waitlist
->next
; /* and move on to the next */
223 relpbuf(bp
, &vinum_conf
.physbufs
);
228 * Check or rebuild the parity blocks of a RAID-4
231 * The variables plex->checkblock and
232 * plex->rebuildblock represent the
233 * subdisk-relative address of the stripe we're
234 * looking at, not the plex-relative address. We
235 * store it in the plex and not as a local
236 * variable because this function could be
237 * stopped, and we don't want to repeat the part
238 * we've already done. This is also the reason
239 * why we don't initialize it here except at the
240 * end. It gets initialized with the plex on
243 * Each call to this function processes at most
244 * one stripe. We can't loop in this function,
245 * because we're unstoppable, so we have to be
246 * called repeatedly from userland.
249 parityops(struct vinum_ioctl_msg
*data
)
253 int size
; /* I/O transfer size, bytes */
254 vinum_off_t stripe
; /* stripe number in plex */
255 int psd
; /* parity subdisk number */
256 struct rangelock
*lock
; /* lock on stripe */
257 struct _ioctl_reply
*reply
;
258 off_t pstripe
; /* pointer to our stripe counter */
260 off_t errorloc
; /* offset of parity error */
261 enum parityop op
; /* operation to perform */
263 plexno
= data
->index
;
266 reply
= (struct _ioctl_reply
*) data
;
267 reply
->error
= EAGAIN
; /* expect to repeat this call */
268 plex
= &PLEX
[plexno
];
269 if (!isparity(plex
)) { /* not RAID-4 or RAID-5 */
270 reply
->error
= EINVAL
;
272 } else if (plex
->state
< plex_flaky
) {
274 strcpy(reply
->msg
, "Plex is not completely accessible\n");
277 pstripe
= data
->offset
;
278 stripe
= pstripe
/ plex
->stripesize
; /* stripe number */
279 psd
= plex
->subdisks
- 1 - stripe
% plex
->subdisks
; /* parity subdisk for this stripe */
280 size
= imin(DEFAULT_REVIVE_BLOCKSIZE
, /* one block at a time */
281 plex
->stripesize
<< DEV_BSHIFT
);
283 pbp
= parityrebuild(plex
, pstripe
, size
, op
, &lock
, &errorloc
); /* do the grunt work */
284 if (pbp
== NULL
) { /* no buffer space */
285 reply
->error
= ENOMEM
;
286 return; /* chicken out */
289 * Now we have a result in the data buffer of
290 * the parity buffer header, which we have kept.
291 * Decide what to do with it.
293 reply
->msg
[0] = '\0'; /* until shown otherwise */
294 if ((pbp
->b_flags
& B_ERROR
) == 0) { /* no error */
295 if ((op
== rebuildparity
)
296 || (op
== rebuildandcheckparity
)) {
297 pbp
->b_cmd
= BUF_CMD_WRITE
;
298 pbp
->b_resid
= pbp
->b_bcount
;
299 pbp
->b_bio1
.bio_done
= biodone_sync
;
300 sdio(&pbp
->b_bio1
); /* write the parity block */
301 biowait(&pbp
->b_bio1
, "drvwr");
303 if (((op
== checkparity
)
304 || (op
== rebuildandcheckparity
))
305 && (errorloc
!= -1)) {
306 if (op
== checkparity
)
309 "Parity incorrect at offset 0x%llx\n",
310 (long long)errorloc
);
312 if (reply
->error
== EAGAIN
) { /* still OK, */
313 plex
->checkblock
= pstripe
+ (pbp
->b_bcount
>> DEV_BSHIFT
); /* moved this much further down */
314 if (plex
->checkblock
>= SD
[plex
->sdnos
[0]].sectors
) { /* finished */
315 plex
->checkblock
= 0;
320 if (pbp
->b_flags
& B_ERROR
)
321 reply
->error
= pbp
->b_error
;
323 relpbuf(pbp
, &vinum_conf
.physbufs
);
324 unlockrange(plexno
, lock
);
328 * Rebuild a parity stripe. Return pointer to
329 * parity bp. On return,
331 * 1. The band is locked. The caller must unlock
332 * the band and release the buffer header.
334 * 2. All buffer headers except pbp have been
335 * released. The caller must release pbp.
337 * 3. For checkparity and rebuildandcheckparity,
338 * the parity is compared with the current
339 * parity block. If it's different, the
340 * offset of the error is returned to
341 * errorloc. The caller can set the value of
342 * the pointer to NULL if this is called for
345 * pstripe is the subdisk-relative base address of
346 * the data to be reconstructed; size is the size
347 * of the transfer in bytes.
350 parityrebuild(struct plex
*plex
,
354 struct rangelock
**lockp
,
359 u_int64_t stripe
; /* stripe number */
360 int *parity_buf
; /* buffer address for current parity block */
361 int *newparity_buf
; /* and for new parity block */
362 int mysize
; /* I/O transfer size for this transfer */
363 int isize
; /* mysize in ints */
365 int psd
; /* parity subdisk number */
366 int newpsd
; /* and "subdisk number" of new parity */
367 struct buf
**bpp
; /* pointers to our bps */
368 struct buf
*pbp
; /* buffer header for parity stripe */
370 int bufcount
; /* number of buffers we need */
372 stripe
= pstripe
/ plex
->stripesize
; /* stripe number */
373 psd
= plex
->subdisks
- 1 - stripe
% plex
->subdisks
; /* parity subdisk for this stripe */
374 parity_buf
= NULL
; /* to keep the compiler happy */
378 * It's possible that the default transfer size
379 * we chose is not a factor of the stripe size.
380 * We *must* limit this operation to a single
381 * stripe, at least for RAID-5 rebuild, since
382 * the parity subdisk changes between stripes,
383 * so in this case we need to perform a short
384 * transfer. Set variable mysize to reflect
387 mysize
= u64min(size
, (plex
->stripesize
* (stripe
+ 1) - pstripe
) << DEV_BSHIFT
);
388 isize
= mysize
/ (sizeof(int)); /* number of ints in the buffer */
389 bufcount
= plex
->subdisks
+ 1; /* sd buffers plus result buffer */
390 newpsd
= plex
->subdisks
;
391 bpp
= (struct buf
**) Malloc(bufcount
* sizeof(struct buf
*)); /* array of pointers to bps */
393 /* First, build requests for all subdisks */
394 for (sdno
= 0; sdno
< bufcount
; sdno
++) { /* for each subdisk */
395 if ((sdno
!= psd
) || (op
!= rebuildparity
)) {
396 /* Get a buffer header and initialize it. */
397 bpp
[sdno
] = getpbuf(&vinum_conf
.physbufs
); /* Get a buffer */
398 bpp
[sdno
]->b_data
= Malloc(mysize
);
400 parity_buf
= (int *) bpp
[sdno
]->b_data
;
401 if (sdno
== newpsd
) /* the new one? */
402 bpp
[sdno
]->b_bio1
.bio_driver_info
= SD
[plex
->sdnos
[psd
]].sd_dev
; /* write back to the parity SD */
404 bpp
[sdno
]->b_bio1
.bio_driver_info
= SD
[plex
->sdnos
[sdno
]].sd_dev
; /* device number */
405 KKASSERT(bpp
[sdno
]->b_bio1
.bio_driver_info
);
406 bpp
[sdno
]->b_cmd
= BUF_CMD_READ
; /* either way, read it */
407 bpp
[sdno
]->b_bcount
= mysize
;
408 bpp
[sdno
]->b_resid
= bpp
[sdno
]->b_bcount
;
409 bpp
[sdno
]->b_bio1
.bio_offset
= (off_t
)pstripe
<< DEV_BSHIFT
; /* transfer from here */
410 bpp
[sdno
]->b_bio1
.bio_done
= biodone_sync
;
414 /* Initialize result buffer */
416 newparity_buf
= (int *) bpp
[newpsd
]->b_data
;
417 bzero(newparity_buf
, mysize
);
420 * Now lock the stripe with the first non-parity
423 *lockp
= lockrange(pstripe
* plex
->stripesize
* (plex
->subdisks
- 1),
428 * Then issue requests for all subdisks in
429 * parallel. Don't transfer the parity stripe
430 * if we're rebuilding parity, unless we also
433 for (sdno
= 0; sdno
< plex
->subdisks
; sdno
++) { /* for each real subdisk */
434 if ((sdno
!= psd
) || (op
!= rebuildparity
)) {
435 sdio(&bpp
[sdno
]->b_bio1
);
440 * Next, wait for the requests to complete.
441 * We wait in the order in which they were
442 * issued, which isn't necessarily the order in
443 * which they complete, but we don't have a
444 * convenient way of doing the latter, and the
447 for (sdno
= 0; sdno
< plex
->subdisks
; sdno
++) { /* for each subdisk */
448 if ((sdno
!= psd
) || (op
!= rebuildparity
)) {
449 biowait(&bpp
[sdno
]->b_bio1
, "drvio");
450 if (bpp
[sdno
]->b_flags
& B_ERROR
) /* can't read, */
451 error
= bpp
[sdno
]->b_error
;
452 else if (sdno
!= psd
) { /* update parity */
453 sbuf
= (int *) bpp
[sdno
]->b_data
;
454 for (i
= 0; i
< isize
; i
++)
455 ((int *) newparity_buf
)[i
] ^= sbuf
[i
]; /* xor in the buffer */
458 if (sdno
!= psd
) { /* release all bps except parity */
459 Free(bpp
[sdno
]->b_data
);
460 relpbuf(bpp
[sdno
], &vinum_conf
.physbufs
); /* give back our resources */
465 * If we're checking, compare the calculated
466 * and the read parity block. If they're
467 * different, return the plex-relative offset;
468 * otherwise return -1.
470 if ((op
== checkparity
)
471 || (op
== rebuildandcheckparity
)) {
472 *errorloc
= -1; /* no error yet */
473 for (i
= 0; i
< isize
; i
++) {
474 if (parity_buf
[i
] != newparity_buf
[i
]) {
475 *errorloc
= (off_t
) (pstripe
<< DEV_BSHIFT
) * (plex
->subdisks
- 1)
480 Free(bpp
[psd
]->b_data
);
481 relpbuf(bpp
[psd
], &vinum_conf
.physbufs
); /* give back our resources */
483 /* release our resources */
486 pbp
->b_flags
|= B_ERROR
;
487 pbp
->b_error
= error
;
493 * Initialize a subdisk by writing zeroes to the
494 * complete address space. If verify is set,
495 * check each transfer for correctness.
497 * Each call to this function writes (and maybe
498 * checks) a single block.
501 initsd(int sdno
, int verify
)
508 int size
; /* size of init block, bytes */
509 vinum_off_t plexblkno
; /* lblkno in plex */
510 int verified
; /* set when we're happy with what we wrote */
513 plexblkno
= 0; /* to keep the compiler happy */
515 if (sd
->plexno
< 0) /* no plex? */
517 plex
= &PLEX
[sd
->plexno
]; /* point to plex */
518 if (plex
->volno
>= 0)
519 vol
= &VOL
[plex
->volno
];
523 if (sd
->init_blocksize
== 0) {
524 if (plex
->stripesize
!= 0) /* we're striped, don't init more than */
525 sd
->init_blocksize
= u64min(DEFAULT_REVIVE_BLOCKSIZE
, /* one block at a time */
526 plex
->stripesize
<< DEV_BSHIFT
);
528 sd
->init_blocksize
= DEFAULT_REVIVE_BLOCKSIZE
;
529 } else if (sd
->init_blocksize
> MAX_REVIVE_BLOCKSIZE
)
530 sd
->init_blocksize
= MAX_REVIVE_BLOCKSIZE
;
532 size
= u64min(sd
->init_blocksize
>> DEV_BSHIFT
, sd
->sectors
- sd
->initialized
) << DEV_BSHIFT
;
534 bp
= getpbuf(&vinum_conf
.physbufs
); /* Get a buffer */
535 bp
->b_data
= Malloc(size
);
538 while (!verified
) { /* until we're happy with it, */
540 bp
->b_resid
= bp
->b_bcount
;
541 bp
->b_bio1
.bio_offset
= (off_t
)sd
->initialized
<< DEV_BSHIFT
; /* write it to here */
542 bp
->b_bio1
.bio_driver_info
= SD
[sdno
].sd_dev
;
543 bp
->b_bio1
.bio_done
= biodone_sync
;
544 KKASSERT(bp
->b_bio1
.bio_driver_info
);
545 bzero(bp
->b_data
, bp
->b_bcount
);
546 bp
->b_cmd
= BUF_CMD_WRITE
;
547 sdio(&bp
->b_bio1
); /* perform the I/O */
548 biowait(&bp
->b_bio1
, "drvwr");
549 if (bp
->b_flags
& B_ERROR
)
551 if ((error
== 0) && verify
) { /* check that it got there */
553 bp
->b_resid
= bp
->b_bcount
;
554 bp
->b_bio1
.bio_offset
= (off_t
)sd
->initialized
<< DEV_BSHIFT
; /* read from here */
555 bp
->b_bio1
.bio_driver_info
= SD
[sdno
].sd_dev
;
556 bp
->b_bio1
.bio_done
= biodone_sync
;
557 KKASSERT(bp
->b_bio1
.bio_driver_info
);
558 bp
->b_cmd
= BUF_CMD_READ
; /* read it back */
560 biowait(&bp
->b_bio1
, "drvrd");
562 * XXX Bug fix code. This is hopefully no
563 * longer needed (21 February 2000).
565 if (bp
->b_flags
& B_ERROR
)
567 else if ((*bp
->b_data
!= 0) /* first word spammed */
568 ||(bcmp(bp
->b_data
, &bp
->b_data
[1], bp
->b_bcount
- 1))) { /* or one of the others */
569 kprintf("vinum: init error on %s, offset 0x%llx sectors\n",
571 (long long) sd
->initialized
);
579 relpbuf(bp
, &vinum_conf
.physbufs
);
580 if (error
== 0) { /* did it, */
581 sd
->initialized
+= size
>> DEV_BSHIFT
; /* moved this much further down */
582 if (sd
->initialized
>= sd
->sectors
) { /* finished */
584 set_sd_state(sdno
, sd_initialized
, setstate_force
); /* bring the sd up */
585 log(LOG_INFO
, "vinum: %s is %s\n", sd
->name
, sd_state(sd
->state
));
586 save_config(); /* and save the updated configuration */
587 } else /* more to go, */
588 error
= EAGAIN
; /* ya'll come back, see? */
593 /* Local Variables: */
594 /* fill-column: 50 */