/* vinuminterrupt.c: bottom half of the driver */

/*-
 * Copyright (c) 1997, 1998, 1999
 *	Nan Yang Computer Services Limited.  All rights reserved.
 *
 *  Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
 *
 *  Written by Greg Lehey
 *
 *  This software is distributed under the so-called ``Berkeley
 *  License'':
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by Nan Yang Computer
 *	Services Limited.
 * 4. Neither the name of the Company nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * This software is provided ``as is'', and any express or implied
 * warranties, including, but not limited to, the implied warranties of
 * merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall the company or contributors be liable for any
 * direct, indirect, incidental, special, exemplary, or consequential
 * damages (including, but not limited to, procurement of substitute
 * goods or services; loss of use, data, or profits; or business
 * interruption) however caused and on any theory of liability, whether
 * in contract, strict liability, or tort (including negligence or
 * otherwise) arising in any way out of the use of this software, even if
 * advised of the possibility of such damage.
 *
 * $Id: vinuminterrupt.c,v 1.12 2000/11/24 03:41:42 grog Exp grog $
 * $FreeBSD: src/sys/dev/vinum/vinuminterrupt.c,v 1.25.2.3 2001/05/28 05:56:27 grog Exp $
 */

#include "vinumhdr.h"
#include "request.h"
#include <sys/resourcevar.h>

void complete_raid5_write(struct rqelement *);
void complete_rqe(struct bio *bio);
void sdio_done(struct bio *bio);
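
/*
 * Note: these are the completion (bio_done) handlers for the requests
 * that the top half of the driver launches; they run from the I/O
 * completion path and therefore may not sleep.
 */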

/*
 * Take a completed buffer, transfer the data back if
 * it's a read, and complete the high-level request
 * if this is the last subrequest.
 *
 * The bp parameter is in fact a struct rqelement, which
 * includes a couple of extras at the end.
 */
void
complete_rqe(struct bio *bio)
{
    union daemoninfo di;
    struct buf *bp = bio->bio_buf;
    struct rqelement *rqe;
    struct request *rq;
    struct rqgroup *rqg;
    struct bio *ubio;					    /* user buffer */
    struct drive *drive;
    struct sd *sd;
    char *gravity;					    /* for error messages */

    get_mplock();

    rqe = (struct rqelement *) bp;			    /* point to the element that completed */
    rqg = rqe->rqg;					    /* and the request group */
    rq = rqg->rq;					    /* and the complete request */
    ubio = rq->bio;					    /* user buffer */

#ifdef VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_iodone, (union rqinfou) rqe, ubio);
#endif
    drive = &DRIVE[rqe->driveno];
    drive->active--;					    /* one less outstanding I/O on this drive */
    vinum_conf.active--;				    /* one less outstanding I/O globally */
    if ((drive->active == (DRIVE_MAXACTIVE - 1))	    /* we were at the drive limit */
	|| (vinum_conf.active == VINUM_MAXACTIVE))	    /* or the global limit */
	wakeup(&launch_requests);			    /* let another one at it */
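    /*
     * launch_requests() sleeps on &launch_requests when it reaches
     * either of these limits, so the transition back below a limit
     * is the only point at which a wakeup is needed.
     */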
    if ((bp->b_flags & B_ERROR) != 0) {			    /* transfer in error */
	gravity = "";
	sd = &SD[rqe->sdno];

	if (bp->b_error != 0)				    /* did it return a number? */
	    rq->error = bp->b_error;			    /* yes, put it in. */
	else if (rq->error == 0)			    /* no: do we have one already? */
	    rq->error = EIO;				    /* no: catchall "I/O error" */
	sd->lasterror = rq->error;
	if (bp->b_cmd == BUF_CMD_READ) {
	    if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
		gravity = " fatal";
		set_sd_state(rqe->sdno, sd_crashed, setstate_force); /* subdisk is crashed */
	    }
	    log(LOG_ERR,
		"%s:%s read error, offset %lld for %d bytes\n",
		gravity,
		sd->name,
		(long long)bio->bio_offset,
		bp->b_bcount);
	} else {					    /* write operation */
	    if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
		gravity = "fatal ";
		set_sd_state(rqe->sdno, sd_stale, setstate_force); /* subdisk is stale */
	    }
	    log(LOG_ERR,
		"%s:%s write error, offset %lld for %d bytes\n",
		gravity,
		sd->name,
		(long long)bio->bio_offset,
		bp->b_bcount);
	}
	log(LOG_ERR,
	    "%s: user buffer offset %lld for %d bytes\n",
	    sd->name,
	    (long long)ubio->bio_offset,
	    ubio->bio_buf->b_bcount);
	if (rq->error == ENXIO) {			    /* the drive's down too */
	    log(LOG_ERR,
		"%s: fatal drive I/O error, offset %lld for %d bytes\n",
		DRIVE[rqe->driveno].label.name,
		(long long)bio->bio_offset,
		bp->b_bcount);
	    DRIVE[rqe->driveno].lasterror = rq->error;
	    set_drive_state(rqe->driveno,		    /* take the drive down */
		drive_down,
		setstate_force);
	}
    }
    /* Now update the statistics */
    if (bp->b_cmd == BUF_CMD_READ) {			    /* read operation */
	DRIVE[rqe->driveno].reads++;
	DRIVE[rqe->driveno].bytes_read += bp->b_bcount;
	SD[rqe->sdno].reads++;
	SD[rqe->sdno].bytes_read += bp->b_bcount;
	PLEX[rqe->rqg->plexno].reads++;
	PLEX[rqe->rqg->plexno].bytes_read += bp->b_bcount;
	if (PLEX[rqe->rqg->plexno].volno >= 0) {	    /* volume I/O, not plex */
	    VOL[PLEX[rqe->rqg->plexno].volno].reads++;
	    VOL[PLEX[rqe->rqg->plexno].volno].bytes_read += bp->b_bcount;
	}
    } else {						    /* write operation */
	DRIVE[rqe->driveno].writes++;
	DRIVE[rqe->driveno].bytes_written += bp->b_bcount;
	SD[rqe->sdno].writes++;
	SD[rqe->sdno].bytes_written += bp->b_bcount;
	PLEX[rqe->rqg->plexno].writes++;
	PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount;
	if (PLEX[rqe->rqg->plexno].volno >= 0) {	    /* volume I/O, not plex */
	    VOL[PLEX[rqe->rqg->plexno].volno].writes++;
	    VOL[PLEX[rqe->rqg->plexno].volno].bytes_written += bp->b_bcount;
	}
    }
    if (rqg->flags & XFR_RECOVERY_READ) {		    /* recovery read, */
	int *sdata;					    /* source */
	int *data;					    /* and group data */
	int length;					    /* and count involved */
	int count;					    /* loop counter */
	struct rqelement *urqe = &rqg->rqe[rqg->badsdno];   /* rqe of the bad subdisk */
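
	/*
	 * RAID-4/5 reconstruction: since parity = d0 ^ d1 ^ ... ^ dn,
	 * the missing block is the XOR of the parity block and every
	 * surviving data block in the stripe.  Each completing read
	 * folds its data into the bad subdisk's buffer below; after
	 * the last one, that buffer holds the reconstructed data.
	 */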

	/* XOR destination is the user data */
	sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* old data contents */
	data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */
	length = urqe->grouplen * (DEV_BSIZE / sizeof(int)); /* and number of ints */

	for (count = 0; count < length; count++)
	    data[count] ^= sdata[count];

	/*
	 * In a normal read, we will normally read directly
	 * into the user buffer.  This doesn't work if
	 * we're also doing a recovery, so we have to
	 * copy it
	 */
	if (rqe->flags & XFR_NORMAL_READ) {		    /* normal read as well, */
	    char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */
	    char *dst;

	    dst = (char *) ubio->bio_buf->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */
	    length = rqe->datalen << DEV_BSHIFT;	    /* and count involved */
	    bcopy(src, dst, length);			    /* move it */
	}
    } else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID 4/5 group write operation */
	&& (rqg->active == 1))				    /* and this is the last active request */
	complete_raid5_write(rqe);
    /*
     * This is the earliest place where we can be
     * sure that the request has really finished,
     * since complete_raid5_write can issue new
     * requests.
     */
    rqg->active--;					    /* this request now finished */
    if (rqg->active == 0) {				    /* request group finished, */
	rq->active--;					    /* one less */
	if (rqg->lock) {				    /* got a lock? */
	    unlockrange(rqg->plexno, rqg->lock);	    /* yes, free it */
	    rqg->lock = 0;
	}
    }
    if (rq->active == 0) {				    /* request finished, */
#ifdef VINUMDEBUG
	if (debug & DEBUG_RESID) {
	    if (ubio->bio_buf->b_resid != 0)		    /* still something to transfer? */
		Debugger("resid");
	}
#endif

	if (rq->error) {				    /* did we have an error? */
	    if (rq->isplex) {				    /* plex operation, */
		ubio->bio_buf->b_flags |= B_ERROR;	    /* yes, propagate to user */
		ubio->bio_buf->b_error = rq->error;
	    } else {					    /* try to recover */
		di.rq = rq;
		queue_daemon_request(daemonrq_ioerror, di); /* let the daemon complete */
	    }
	} else {
	    ubio->bio_buf->b_resid = 0;			    /* completed our transfer */
	    if (rq->isplex == 0)			    /* volume request, */
		VOL[rq->volplex.volno].active--;	    /* another request finished */
	    biodone(ubio);				    /* top level buffer completed */
	    freerq(rq);					    /* return the request storage */
	}
    }
    rel_mplock();
}

/* Free a request block and anything hanging off it */
void
freerq(struct request *rq)
{
    struct rqgroup *rqg;
    struct rqgroup *nrqg;				    /* next in chain */
    int rqno;

    for (rqg = rq->rqg; rqg != NULL; rqg = nrqg) {	    /* through the whole request chain */
	if (rqg->lock)					    /* got a lock? */
	    unlockrange(rqg->plexno, rqg->lock);	    /* yes, free it */
	for (rqno = 0; rqno < rqg->count; rqno++) {
	    if ((rqg->rqe[rqno].flags & XFR_MALLOCED)	    /* data buffer was malloced, */
		&& rqg->rqe[rqno].b.b_data)		    /* and the allocation succeeded */
		Free(rqg->rqe[rqno].b.b_data);		    /* free it */
	    if (rqg->rqe[rqno].flags & XFR_BUFLOCKED) {	    /* locked this buffer, */
		BUF_UNLOCK(&rqg->rqe[rqno].b);		    /* unlock it again */
		uninitbufbio(&rqg->rqe[rqno].b);
	    }
	}
	nrqg = rqg->next;				    /* note the next one */
	Free(rqg);					    /* and free this one */
    }
    Free(rq);						    /* free the request itself */
}

/* I/O on subdisk completed */
void
sdio_done(struct bio *bio)
{
    struct sdbuf *sbp;

    get_mplock();

    sbp = (struct sdbuf *) bio->bio_buf;
    if (sbp->b.b_flags & B_ERROR) {			    /* had an error */
	sbp->bio->bio_buf->b_flags |= B_ERROR;		    /* propagate upwards */
	sbp->bio->bio_buf->b_error = sbp->b.b_error;
    }
#ifdef VINUMDEBUG
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_sdiodone, (union rqinfou) bio, bio);
#endif
    sbp->bio->bio_buf->b_resid = sbp->b.b_resid;	    /* copy the resid field */
    /* Now update the statistics */
    if (sbp->b.b_cmd == BUF_CMD_READ) {			    /* read operation */
	DRIVE[sbp->driveno].reads++;
	DRIVE[sbp->driveno].bytes_read += sbp->b.b_bcount;
	SD[sbp->sdno].reads++;
	SD[sbp->sdno].bytes_read += sbp->b.b_bcount;
    } else {						    /* write operation */
	DRIVE[sbp->driveno].writes++;
	DRIVE[sbp->driveno].bytes_written += sbp->b.b_bcount;
	SD[sbp->sdno].writes++;
	SD[sbp->sdno].bytes_written += sbp->b.b_bcount;
    }
    biodone_sync(bio);
    biodone(sbp->bio);					    /* complete the caller's I/O */
    BUF_UNLOCK(&sbp->b);
    uninitbufbio(&sbp->b);
    Free(sbp);
    rel_mplock();
}

/* Start the second phase of a RAID-4 or RAID-5 group write operation. */
void
complete_raid5_write(struct rqelement *rqe)
{
    int *sdata;						    /* source */
    int *pdata;						    /* and parity block data */
    int length;						    /* and count involved */
    int count;						    /* loop counter */
    int rqno;						    /* request index */
    int rqoffset;					    /* offset of request data from parity data */
    struct bio *ubio;					    /* user buffer header */
    struct request *rq;					    /* pointer to our request */
    struct rqgroup *rqg;				    /* and to the request group */
    struct rqelement *prqe;				    /* point to the parity block */
    struct drive *drive;				    /* drive to access */

    rqg = rqe->rqg;					    /* and to our request group */
    rq = rqg->rq;					    /* point to our request */
    ubio = rq->bio;					    /* user's buffer header */
    prqe = &rqg->rqe[0];				    /* point to the parity block */

    /*
     * If we get to this function, we have normal or
     * degraded writes, or a combination of both.  We do
     * the same thing in each case: we perform an
     * exclusive or to the parity block.  The only
     * difference is the origin of the data and the
     * address range.
     */
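    /*
     * The two cases below implement the standard parity identities:
     *
     *   degraded write:  P = Dnew ^ (XOR of all surviving data blocks)
     *   normal write:    Pnew = Pold ^ Dold ^ Dnew
     *
     * i.e. a full rebuild of the parity block, or a read-modify-write
     * that removes the old data from the parity and adds the new.
     */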
    if (rqe->flags & XFR_DEGRADED_WRITE) {		    /* do the degraded write stuff */
	pdata = (int *) (&prqe->b.b_data[(prqe->groupoffset) << DEV_BSHIFT]); /* parity data pointer */
	bzero(pdata, prqe->grouplen << DEV_BSHIFT);	    /* start with nothing in the parity block */

	/* Now get what data we need from each block */
	for (rqno = 1; rqno < rqg->count; rqno++) {	    /* for all the data blocks */
	    rqe = &rqg->rqe[rqno];			    /* this request */
	    sdata = (int *) (&rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]); /* old data */
	    length = rqe->grouplen << (DEV_BSHIFT - 2);	    /* and count involved */
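	    /*
	     * DEV_BSHIFT - 2 converts sectors to a count of ints on
	     * the assumption that sizeof(int) == 4; it is equivalent
	     * to grouplen * (DEV_BSIZE / sizeof(int)) as used above.
	     */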

	    /*
	     * Add the data block to the parity block.  Before
	     * we started the request, we zeroed the parity
	     * block, so the result of adding all the other
	     * blocks and the block we want to write will be
	     * the correct parity block.
	     */
	    for (count = 0; count < length; count++)
		pdata[count] ^= sdata[count];
	    if ((rqe->flags & XFR_MALLOCED)		    /* the buffer was malloced, */
		&& ((rqg->flags & XFR_NORMAL_WRITE) == 0)) { /* and we have no normal write, */
		Free(rqe->b.b_data);			    /* free it now */
		rqe->flags &= ~XFR_MALLOCED;
	    }
	}
    }
    if (rqg->flags & XFR_NORMAL_WRITE) {		    /* do normal write stuff */
	/* Get what data we need from each block */
	for (rqno = 1; rqno < rqg->count; rqno++) {	    /* for all the data blocks */
	    rqe = &rqg->rqe[rqno];			    /* this request */
	    if ((rqe->flags & (XFR_DATA_BLOCK | XFR_BAD_SUBDISK | XFR_NORMAL_WRITE))
		== (XFR_DATA_BLOCK | XFR_NORMAL_WRITE)) {   /* good data block to write */
		sdata = (int *) &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* old data contents */
		rqoffset = rqe->dataoffset + rqe->sdoffset - prqe->sdoffset; /* corresponding parity block offset */
		pdata = (int *) (&prqe->b.b_data[rqoffset << DEV_BSHIFT]); /* parity data pointer */
		length = rqe->datalen * (DEV_BSIZE / sizeof(int)); /* and number of ints */

		/*
		 * "remove" the old data block
		 * from the parity block
		 */
		if ((pdata < ((int *) prqe->b.b_data))
		    || (&pdata[length] > ((int *) (prqe->b.b_data + prqe->b.b_bcount)))
		    || (sdata < ((int *) rqe->b.b_data))
		    || (&sdata[length] > ((int *) (rqe->b.b_data + rqe->b.b_bcount))))
		    panic("complete_raid5_write: bounds overflow");
		for (count = 0; count < length; count++)
		    pdata[count] ^= sdata[count];

		/* "add" the new data block */
		sdata = (int *) (&ubio->bio_buf->b_data[rqe->useroffset << DEV_BSHIFT]); /* new data */
		if ((sdata < ((int *) ubio->bio_buf->b_data))
		    || (&sdata[length] > ((int *) (ubio->bio_buf->b_data + ubio->bio_buf->b_bcount))))
		    panic("complete_raid5_write: bounds overflow");
		for (count = 0; count < length; count++)
		    pdata[count] ^= sdata[count];

		/* Free the malloced buffer */
		if (rqe->flags & XFR_MALLOCED) {	    /* the buffer was malloced, */
		    Free(rqe->b.b_data);		    /* free it */
		    rqe->flags &= ~XFR_MALLOCED;
		} else
		    panic("complete_raid5_write: malloc conflict");
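
		/*
		 * Phase two for this block: the rqe was set up as a
		 * read of the old data; reuse it to write the user's
		 * new data back to the same place on the subdisk.
		 */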
		if ((rqe->b.b_cmd == BUF_CMD_READ)	    /* this was a read */
		    && ((rqe->flags & XFR_BAD_SUBDISK) == 0)) { /* and we can write this block */
		    rqe->b.b_cmd = BUF_CMD_WRITE;	    /* we're writing now */
		    rqe->b.b_bio1.bio_done = complete_rqe;  /* by calling us here */
		    rqe->flags &= ~XFR_PARITYOP;	    /* reset flags that brought us here */
		    rqe->b.b_data = &ubio->bio_buf->b_data[rqe->useroffset << DEV_BSHIFT]; /* point to the user data */
		    rqe->b.b_bcount = rqe->datalen << DEV_BSHIFT; /* length to write */
		    rqe->b.b_resid = rqe->b.b_bcount;	    /* nothing transferred */
		    rqe->b.b_bio1.bio_offset += (off_t)rqe->dataoffset << DEV_BSHIFT; /* point to the correct block */
		    drive = &DRIVE[rqe->driveno];	    /* drive to access */
		    rqe->b.b_bio1.bio_driver_info = drive->dev;
		    rqg->active++;			    /* another active request */

		    /* We can't sleep here, so we just increment the counters. */
		    drive->active++;
		    if (drive->active >= drive->maxactive)
			drive->maxactive = drive->active;
		    vinum_conf.active++;
		    if (vinum_conf.active >= vinum_conf.maxactive)
			vinum_conf.maxactive = vinum_conf.active;
#ifdef VINUMDEBUG
		    if (debug & DEBUG_ADDRESSES)
			log(LOG_DEBUG,
			    "  %s dev %s, sd %d, offset 0x%jx, devoffset 0x%jx, length %d\n",
			    (rqe->b.b_cmd == BUF_CMD_READ) ? "Read" : "Write",
			    drive->devicename,
			    rqe->sdno,
			    (uintmax_t)(rqe->b.b_bio1.bio_offset - ((off_t)SD[rqe->sdno].driveoffset << DEV_BSHIFT)),
			    (uintmax_t)rqe->b.b_bio1.bio_offset,
			    rqe->b.b_bcount);
		    if (debug & DEBUG_LASTREQS)
			logrq(loginfo_raid5_data, (union rqinfou) rqe, ubio);
#endif
		    vn_strategy(drive->vp, &rqe->b.b_bio1);
		}
	    }
	}
    }
    /* Finally, write the parity block */
    rqe = &rqg->rqe[0];
    rqe->b.b_cmd = BUF_CMD_WRITE;			    /* we're writing now */
    rqe->b.b_bio1.bio_done = complete_rqe;		    /* by calling us here */
    rqg->flags &= ~XFR_PARITYOP;			    /* reset flags that brought us here */
    rqe->b.b_bcount = rqe->buflen << DEV_BSHIFT;	    /* length to write */
    rqe->b.b_resid = rqe->b.b_bcount;			    /* nothing transferred */
    drive = &DRIVE[rqe->driveno];			    /* drive to access */
    rqe->b.b_bio1.bio_driver_info = drive->dev;
    rqg->active++;					    /* another active request */

    /* We can't sleep here, so we just increment the counters. */
    drive->active++;
    if (drive->active >= drive->maxactive)
	drive->maxactive = drive->active;
    vinum_conf.active++;
    if (vinum_conf.active >= vinum_conf.maxactive)
	vinum_conf.maxactive = vinum_conf.active;

#ifdef VINUMDEBUG
    if (debug & DEBUG_ADDRESSES)
	log(LOG_DEBUG,
	    "  %s dev %s, sd %d, offset 0x%jx, devoffset 0x%jx, length %d\n",
	    (rqe->b.b_cmd == BUF_CMD_READ) ? "Read" : "Write",
	    drive->devicename,
	    rqe->sdno,
	    (uintmax_t)(rqe->b.b_bio1.bio_offset - ((off_t)SD[rqe->sdno].driveoffset << DEV_BSHIFT)),
	    (uintmax_t)rqe->b.b_bio1.bio_offset,
	    rqe->b.b_bcount);
    if (debug & DEBUG_LASTREQS)
	logrq(loginfo_raid5_parity, (union rqinfou) rqe, ubio);
#endif
    vn_strategy(drive->vp, &rqe->b.b_bio1);
}