1 /* vinuminterrupt.c: bottom half of the driver */
4 * Copyright (c) 1997, 1998, 1999
5 * Nan Yang Computer Services Limited. All rights reserved.
7 * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
9 * Written by Greg Lehey
11 * This software is distributed under the so-called ``Berkeley
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 * 3. All advertising materials mentioning features or use of this software
23 * must display the following acknowledgement:
24 * This product includes software developed by Nan Yang Computer
26 * 4. Neither the name of the Company nor the names of its contributors
27 * may be used to endorse or promote products derived from this software
28 * without specific prior written permission.
30 * This software is provided ``as is'', and any express or implied
31 * warranties, including, but not limited to, the implied warranties of
32 * merchantability and fitness for a particular purpose are disclaimed.
33 * In no event shall the company or contributors be liable for any
34 * direct, indirect, incidental, special, exemplary, or consequential
35 * damages (including, but not limited to, procurement of substitute
36 * goods or services; loss of use, data, or profits; or business
37 * interruption) however caused and on any theory of liability, whether
38 * in contract, strict liability, or tort (including negligence or
39 * otherwise) arising in any way out of the use of this software, even if
40 * advised of the possibility of such damage.
42 * $Id: vinuminterrupt.c,v 1.12 2000/11/24 03:41:42 grog Exp grog $
43 * $FreeBSD: src/sys/dev/vinum/vinuminterrupt.c,v 1.25.2.3 2001/05/28 05:56:27 grog Exp $
44 * $DragonFly: src/sys/dev/raid/vinum/vinuminterrupt.c,v 1.13 2007/08/01 11:46:46 swildner Exp $
49 #include <sys/resourcevar.h>
/*
 * Forward declarations for three of the routines defined in this file:
 * complete_raid5_write (second phase of a RAID-4/5 group write),
 * complete_rqe (completion handler for one request element) and
 * sdio_done (completion handler for subdisk I/O).
 */
51 void complete_raid5_write(struct rqelement
*);
52 void complete_rqe(struct bio
*bio
);
53 void sdio_done(struct bio
*bio
);
/*
 * Take a completed buffer, transfer the data back if
 * it's a read, and complete the high-level request
 * if this is the last subrequest.
 *
 * The buffer reached through bio->bio_buf is in fact
 * the start of a struct rqelement, which includes a
 * couple of extras at the end.
 */
/*
 * complete_rqe: completion handler for a single request element.
 *
 * bio is the completed I/O.  The buffer it points to (bio->bio_buf) is
 * cast to the struct rqelement that contains it; from there the code
 * reaches the request group (rqe->rqg), the request (rqg->rq) and the
 * user bio (rq->bio).
 *
 * Visible steps, in order:
 *   - decrement the per-drive and global active counters and wake
 *     anybody sleeping on launch_requests;
 *   - if the buffer carries B_ERROR, record an error number in
 *     rq->error and sd->lasterror and change subdisk and drive state;
 *   - update the read or write statistics of the drive, subdisk, plex
 *     and (when the plex has a volume) the volume;
 *   - for a recovery read, XOR the data into the buffer of the bad
 *     subdisk; for a RAID-4/5 group write whose last element this is,
 *     call complete_raid5_write;
 *   - decrement rqg->active, and when the group and then the request
 *     have no active elements left, finish the user bio.
 *
 * NOTE(review): this copy looks like a lossy extraction.  A line-number
 * gutter is fused into the text, and the function braces, several
 * else and closing-brace lines, the declarations of rqg, rq, sd, drive
 * and dst, and the calls that own the format strings are not visible.
 * The nesting of the statements below therefore cannot be confirmed
 * from here; check against a pristine copy before relying on it.
 */
64 complete_rqe(struct bio
*bio
)
66 struct buf
*bp
= bio
->bio_buf
;
67 struct rqelement
*rqe
;
70 struct bio
*ubio
; /* user buffer */
73 char *gravity
; /* for error messages */
75 rqe
= (struct rqelement
*) bp
; /* point to the element that completed */
76 rqg
= rqe
->rqg
; /* and the request group */
77 rq
= rqg
->rq
; /* and the complete request */
78 ubio
= rq
->bio
; /* user buffer */
81 if (debug
& DEBUG_LASTREQS
)
82 logrq(loginfo_iodone
, (union rqinfou
) rqe
, ubio
);
84 drive
= &DRIVE
[rqe
->driveno
];
85 drive
->active
--; /* one less outstanding I/O on this drive */
86 vinum_conf
.active
--; /* one less outstanding I/O globally */
/*
 * NOTE(review): both counters have already been decremented here.  The
 * drive test compares against DRIVE_MAXACTIVE - 1 but the global test
 * compares against VINUM_MAXACTIVE itself; confirm that the asymmetry
 * is intended.
 */
87 if ((drive
->active
== (DRIVE_MAXACTIVE
- 1)) /* we were at the drive limit */
88 ||(vinum_conf
.active
== VINUM_MAXACTIVE
)) /* or the global limit */
89 wakeup(&launch_requests
); /* let another one at it */
/*
 * Error handling.  The error number comes from bp->b_error when that is
 * nonzero, otherwise EIO is stored if rq->error is still 0.  A read
 * failure marks the subdisk sd_crashed and a write failure marks it
 * sd_stale, in both cases under the same test: the error is ENXIO or
 * the subdisk does not have VF_RETRYERRORS set.
 */
90 if ((bp
->b_flags
& B_ERROR
) != 0) { /* transfer in error */
94 if (bp
->b_error
!= 0) /* did it return a number? */
95 rq
->error
= bp
->b_error
; /* yes, put it in. */
96 else if (rq
->error
== 0) /* no: do we have one already? */
97 rq
->error
= EIO
; /* no: catchall "I/O error" */
98 sd
->lasterror
= rq
->error
;
99 if (bp
->b_cmd
== BUF_CMD_READ
) {
100 if ((rq
->error
== ENXIO
) || (sd
->flags
& VF_RETRYERRORS
) == 0) {
102 set_sd_state(rqe
->sdno
, sd_crashed
, setstate_force
); /* subdisk is crashed */
105 "%s:%s read error, offset %lld for %d bytes\n",
108 (long long)bio
->bio_offset
,
110 } else { /* write operation */
111 if ((rq
->error
== ENXIO
) || (sd
->flags
& VF_RETRYERRORS
) == 0) {
113 set_sd_state(rqe
->sdno
, sd_stale
, setstate_force
); /* subdisk is stale */
116 "%s:%s write error, offset %lld for %d bytes\n",
119 (long long)bio
->bio_offset
,
123 "%s: user buffer offset %lld for %d bytes\n",
125 (long long)ubio
->bio_offset
,
126 ubio
->bio_buf
->b_bcount
);
127 if (rq
->error
== ENXIO
) { /* the drive's down too */
129 "%s: fatal drive I/O error, offset %lld for %d bytes\n",
130 DRIVE
[rqe
->driveno
].label
.name
,
131 (long long)bio
->bio_offset
,
133 DRIVE
[rqe
->driveno
].lasterror
= rq
->error
;
134 set_drive_state(rqe
->driveno
, /* take the drive down */
139 /* Now update the statistics */
140 if (bp
->b_cmd
== BUF_CMD_READ
) { /* read operation */
141 DRIVE
[rqe
->driveno
].reads
++;
142 DRIVE
[rqe
->driveno
].bytes_read
+= bp
->b_bcount
;
143 SD
[rqe
->sdno
].reads
++;
144 SD
[rqe
->sdno
].bytes_read
+= bp
->b_bcount
;
145 PLEX
[rqe
->rqg
->plexno
].reads
++;
146 PLEX
[rqe
->rqg
->plexno
].bytes_read
+= bp
->b_bcount
;
147 if (PLEX
[rqe
->rqg
->plexno
].volno
>= 0) { /* volume I/O, not plex */
148 VOL
[PLEX
[rqe
->rqg
->plexno
].volno
].reads
++;
149 VOL
[PLEX
[rqe
->rqg
->plexno
].volno
].bytes_read
+= bp
->b_bcount
;
151 } else { /* write operation */
152 DRIVE
[rqe
->driveno
].writes
++;
153 DRIVE
[rqe
->driveno
].bytes_written
+= bp
->b_bcount
;
154 SD
[rqe
->sdno
].writes
++;
155 SD
[rqe
->sdno
].bytes_written
+= bp
->b_bcount
;
156 PLEX
[rqe
->rqg
->plexno
].writes
++;
157 PLEX
[rqe
->rqg
->plexno
].bytes_written
+= bp
->b_bcount
;
158 if (PLEX
[rqe
->rqg
->plexno
].volno
>= 0) { /* volume I/O, not plex */
159 VOL
[PLEX
[rqe
->rqg
->plexno
].volno
].writes
++;
160 VOL
[PLEX
[rqe
->rqg
->plexno
].volno
].bytes_written
+= bp
->b_bcount
;
/*
 * Recovery read: XOR the group-relative part of this element (starting
 * at groupoffset sectors into its buffer) into the buffer of the
 * element at index rqg->badsdno.  The length is taken from the bad
 * element (urqe->grouplen sectors) and the XOR runs one int at a time.
 */
163 if (rqg
->flags
& XFR_RECOVERY_READ
) { /* recovery read, */
164 int *sdata
; /* source */
165 int *data
; /* and group data */
166 int length
; /* and count involved */
167 int count
; /* loop counter */
168 struct rqelement
*urqe
= &rqg
->rqe
[rqg
->badsdno
]; /* rqe of the bad subdisk */
170 /* XOR destination is the user data */
171 sdata
= (int *) &rqe
->b
.b_data
[rqe
->groupoffset
<< DEV_BSHIFT
]; /* old data contents */
172 data
= (int *) &urqe
->b
.b_data
[urqe
->groupoffset
<< DEV_BSHIFT
]; /* destination */
173 length
= urqe
->grouplen
* (DEV_BSIZE
/ sizeof(int)); /* and number of ints */
175 for (count
= 0; count
< length
; count
++)
176 data
[count
] ^= sdata
[count
];
179 * In a normal read, we will normally read directly
180 * into the user buffer. This doesn't work if
181 * we're also doing a recovery, so we have to
184 if (rqe
->flags
& XFR_NORMAL_READ
) { /* normal read as well, */
185 char *src
= &rqe
->b
.b_data
[rqe
->dataoffset
<< DEV_BSHIFT
]; /* read data is here */
188 dst
= (char *) ubio
->bio_buf
->b_data
+ (rqe
->useroffset
<< DEV_BSHIFT
); /* where to put it in user buffer */
189 length
= rqe
->datalen
<< DEV_BSHIFT
; /* and count involved */
190 bcopy(src
, dst
, length
); /* move it */
192 } else if ((rqg
->flags
& (XFR_NORMAL_WRITE
| XFR_DEGRADED_WRITE
)) /* RAID 4/5 group write operation */
193 &&(rqg
->active
== 1)) /* and this is the last active request */
194 complete_raid5_write(rqe
);
196 * This is the earliest place where we can be
197 * sure that the request has really finished,
198 * since complete_raid5_write can issue new
/*
 * Group and request accounting.  rqg->active is decremented only after
 * the possible call to complete_raid5_write above, which itself
 * increments rqg->active for every element it reissues.
 */
201 rqg
->active
--; /* this request now finished */
202 if (rqg
->active
== 0) { /* request group finished, */
203 rq
->active
--; /* one less */
204 if (rqg
->lock
) { /* got a lock? */
205 unlockrange(rqg
->plexno
, rqg
->lock
); /* yes, free it */
/*
 * Request finished.  With an error on a plex request the error is
 * copied to the user buffer; with an error on a volume request the
 * request is handed to the daemon (daemonrq_ioerror).
 * NOTE(review): the line separating the error branch from the
 * statements that clear b_resid, call biodone and call freerq is not
 * visible, so whether those run on the error path cannot be told from
 * this copy.
 */
209 if (rq
->active
== 0) { /* request finished, */
211 if (debug
& DEBUG_RESID
) {
212 if (ubio
->bio_buf
->b_resid
!= 0) /* still something to transfer? */
217 if (rq
->error
) { /* did we have an error? */
218 if (rq
->isplex
) { /* plex operation, */
219 ubio
->bio_buf
->b_flags
|= B_ERROR
; /* yes, propagate to user */
220 ubio
->bio_buf
->b_error
= rq
->error
;
221 } else /* try to recover */
222 queue_daemon_request(daemonrq_ioerror
, (union daemoninfo
) rq
); /* let the daemon complete */
224 ubio
->bio_buf
->b_resid
= 0; /* completed our transfer */
225 if (rq
->isplex
== 0) /* volume request, */
226 VOL
[rq
->volplex
.volno
].active
--; /* another request finished */
227 biodone(ubio
); /* top level buffer completed */
228 freerq(rq
); /* return the request storage */
233 /* Free a request block and anything hanging off it */
/*
 * freerq: release a request and everything chained off it.
 *
 * rq is the request to free.  The loop walks the request group chain
 * starting at rq->rqg.  For each group it releases the range lock if
 * one is held, then for each of the rqg->count elements it frees the
 * data buffer when the element is flagged XFR_MALLOCED and b_data is
 * non-null, and unlocks the embedded buf when the element is flagged
 * XFR_BUFLOCKED.  The next pointer is saved in nrqg before the group
 * itself is freed, and the request is freed last.
 *
 * NOTE(review): the function braces, return type and the declarations
 * of rqg and rqno are not visible in this copy.
 */
235 freerq(struct request
*rq
)
238 struct rqgroup
*nrqg
; /* next in chain */
241 for (rqg
= rq
->rqg
; rqg
!= NULL
; rqg
= nrqg
) { /* through the whole request chain */
242 if (rqg
->lock
) /* got a lock? */
243 unlockrange(rqg
->plexno
, rqg
->lock
); /* yes, free it */
244 for (rqno
= 0; rqno
< rqg
->count
; rqno
++) {
245 if ((rqg
->rqe
[rqno
].flags
& XFR_MALLOCED
) /* data buffer was malloced, */
246 &&rqg
->rqe
[rqno
].b
.b_data
) /* and the allocation succeeded */
247 Free(rqg
->rqe
[rqno
].b
.b_data
); /* free it */
248 if (rqg
->rqe
[rqno
].flags
& XFR_BUFLOCKED
) { /* locked this buffer, */
249 BUF_UNLOCK(&rqg
->rqe
[rqno
].b
); /* unlock it again */
250 BUF_LOCKFREE(&rqg
->rqe
[rqno
].b
);
/* rqg is about to be freed, so the successor must be read first. */
253 nrqg
= rqg
->next
; /* note the next one */
254 Free(rqg
); /* and free this one */
256 Free(rq
); /* free the request itself */
259 /* I/O on subdisk completed */
/*
 * sdio_done: completion handler for I/O issued directly to a subdisk.
 *
 * bio is the completed I/O; its buffer (bio->bio_buf) is cast to the
 * struct sdbuf that contains it.  The handler copies an error and the
 * residual count from the embedded buf (sbp->b) to the buffer of the
 * originating bio (sbp->bio), updates the drive and subdisk read or
 * write statistics, and completes the originating bio with biodone.
 *
 * NOTE(review): the function braces, return type and the declaration
 * of sbp are not visible in this copy, and the gutter numbers skip
 * lines around the final BUF_LOCKFREE, so any further cleanup of sbp
 * cannot be confirmed from here.
 */
261 sdio_done(struct bio
*bio
)
265 sbp
= (struct sdbuf
*) bio
->bio_buf
;
266 if (sbp
->b
.b_flags
& B_ERROR
) { /* had an error */
267 sbp
->bio
->bio_buf
->b_flags
|= B_ERROR
; /* propagate upwards */
268 sbp
->bio
->bio_buf
->b_error
= sbp
->b
.b_error
;
271 if (debug
& DEBUG_LASTREQS
)
272 logrq(loginfo_sdiodone
, (union rqinfou
)bio
, bio
);
274 sbp
->bio
->bio_buf
->b_resid
= sbp
->b
.b_resid
; /* copy the resid field */
275 /* Now update the statistics */
276 if (sbp
->b
.b_cmd
== BUF_CMD_READ
) { /* read operation */
277 DRIVE
[sbp
->driveno
].reads
++;
278 DRIVE
[sbp
->driveno
].bytes_read
+= sbp
->b
.b_bcount
;
279 SD
[sbp
->sdno
].reads
++;
280 SD
[sbp
->sdno
].bytes_read
+= sbp
->b
.b_bcount
;
281 } else { /* write operation */
282 DRIVE
[sbp
->driveno
].writes
++;
283 DRIVE
[sbp
->driveno
].bytes_written
+= sbp
->b
.b_bcount
;
284 SD
[sbp
->sdno
].writes
++;
285 SD
[sbp
->sdno
].bytes_written
+= sbp
->b
.b_bcount
;
288 biodone(sbp
->bio
); /* complete the caller's I/O */
290 BUF_LOCKFREE(&sbp
->b
);
294 /* Start the second phase of a RAID-4 or RAID-5 group write operation. */
/*
 * complete_raid5_write: second phase of a RAID-4 or RAID-5 group write.
 *
 * rqe is an element of the group whose first-phase I/O has completed;
 * the group is rqe->rqg, the request is rqg->rq and the user bio is
 * rq->bio.  Element 0 of the group (prqe) is treated as the parity
 * block.
 *
 * Visible steps, in order:
 *   - degraded write: zero the parity area and XOR into it the group
 *     data of every element from index 1 upward;
 *   - normal write: for every good data block being written, XOR the
 *     old contents out of the parity block and the new user data into
 *     it, with bounds checks that panic on overflow;
 *   - turn each data element that was a read on a usable subdisk into
 *     a write of the user data and hand it to vn_strategy;
 *   - set up a write of buflen sectors and hand it to vn_strategy.
 *
 * Each reissued element increments rqg->active and sets bio_done to
 * complete_rqe.
 *
 * NOTE(review): this copy looks like a lossy extraction.  A line-number
 * gutter is fused into the text and the function braces, several
 * closing-brace and else lines, and the calls that own the debug
 * format strings are not visible, so the nesting below cannot be
 * confirmed from here.
 */
296 complete_raid5_write(struct rqelement
*rqe
)
298 int *sdata
; /* source */
299 int *pdata
; /* and parity block data */
300 int length
; /* and count involved */
301 int count
; /* loop counter */
302 int rqno
; /* request index */
303 int rqoffset
; /* offset of request data from parity data */
304 struct bio
*ubio
; /* user buffer header */
305 struct request
*rq
; /* pointer to our request */
306 struct rqgroup
*rqg
; /* and to the request group */
307 struct rqelement
*prqe
; /* point to the parity block */
308 struct drive
*drive
; /* drive to access */
309 rqg
= rqe
->rqg
; /* and to our request group */
310 rq
= rqg
->rq
; /* point to our request */
311 ubio
= rq
->bio
; /* user's buffer header */
312 prqe
= &rqg
->rqe
[0]; /* point to the parity block */
315 * If we get to this function, we have normal or
316 * degraded writes, or a combination of both. We do
317 * the same thing in each case: we perform an
318 * exclusive or to the parity block. The only
319 * difference is the origin of the data and the
/*
 * Degraded write.  Note that the test reads the flags of the element
 * passed in (rqe), while the normal-write test further down reads the
 * flags of the group (rqg).
 * NOTE(review): the length here is grouplen << (DEV_BSHIFT - 2), which
 * matches the DEV_BSIZE / sizeof(int) form used elsewhere only when
 * sizeof(int) is 4 and DEV_BSIZE is 1 << DEV_BSHIFT; confirm.
 */
322 if (rqe
->flags
& XFR_DEGRADED_WRITE
) { /* do the degraded write stuff */
323 pdata
= (int *) (&prqe
->b
.b_data
[(prqe
->groupoffset
) << DEV_BSHIFT
]); /* parity data pointer */
324 bzero(pdata
, prqe
->grouplen
<< DEV_BSHIFT
); /* start with nothing in the parity block */
326 /* Now get what data we need from each block */
327 for (rqno
= 1; rqno
< rqg
->count
; rqno
++) { /* for all the data blocks */
328 rqe
= &rqg
->rqe
[rqno
]; /* this request */
329 sdata
= (int *) (&rqe
->b
.b_data
[rqe
->groupoffset
<< DEV_BSHIFT
]); /* old data */
330 length
= rqe
->grouplen
<< (DEV_BSHIFT
- 2); /* and count involved */
333 * Add the data block to the parity block. Before
334 * we started the request, we zeroed the parity
335 * block, so the result of adding all the other
336 * blocks and the block we want to write will be
337 * the correct parity block.
339 for (count
= 0; count
< length
; count
++)
340 pdata
[count
] ^= sdata
[count
];
341 if ((rqe
->flags
& XFR_MALLOCED
) /* the buffer was malloced, */
342 &&((rqg
->flags
& XFR_NORMAL_WRITE
) == 0)) { /* and we have no normal write, */
343 Free(rqe
->b
.b_data
); /* free it now */
344 rqe
->flags
&= ~XFR_MALLOCED
;
/*
 * Normal write.  Only elements whose flags have XFR_DATA_BLOCK and
 * XFR_NORMAL_WRITE set and XFR_BAD_SUBDISK clear are processed.
 * rqoffset locates the matching sectors inside the parity buffer:
 * dataoffset + sdoffset of the data element minus sdoffset of the
 * parity element.
 */
348 if (rqg
->flags
& XFR_NORMAL_WRITE
) { /* do normal write stuff */
349 /* Get what data we need from each block */
350 for (rqno
= 1; rqno
< rqg
->count
; rqno
++) { /* for all the data blocks */
351 rqe
= &rqg
->rqe
[rqno
]; /* this request */
352 if ((rqe
->flags
& (XFR_DATA_BLOCK
| XFR_BAD_SUBDISK
| XFR_NORMAL_WRITE
))
353 == (XFR_DATA_BLOCK
| XFR_NORMAL_WRITE
)) { /* good data block to write */
354 sdata
= (int *) &rqe
->b
.b_data
[rqe
->dataoffset
<< DEV_BSHIFT
]; /* old data contents */
355 rqoffset
= rqe
->dataoffset
+ rqe
->sdoffset
- prqe
->sdoffset
; /* corresponding parity block offset */
356 pdata
= (int *) (&prqe
->b
.b_data
[rqoffset
<< DEV_BSHIFT
]); /* parity data pointer */
357 length
= rqe
->datalen
* (DEV_BSIZE
/ sizeof(int)); /* and number of ints */
360 * "remove" the old data block
361 * from the parity block
363 if ((pdata
< ((int *) prqe
->b
.b_data
))
364 || (&pdata
[length
] > ((int *) (prqe
->b
.b_data
+ prqe
->b
.b_bcount
)))
365 || (sdata
< ((int *) rqe
->b
.b_data
))
366 || (&sdata
[length
] > ((int *) (rqe
->b
.b_data
+ rqe
->b
.b_bcount
))))
367 panic("complete_raid5_write: bounds overflow");
368 for (count
= 0; count
< length
; count
++)
369 pdata
[count
] ^= sdata
[count
];
371 /* "add" the new data block */
372 sdata
= (int *) (&ubio
->bio_buf
->b_data
[rqe
->useroffset
<< DEV_BSHIFT
]); /* new data */
373 if ((sdata
< ((int *) ubio
->bio_buf
->b_data
))
374 || (&sdata
[length
] > ((int *) (ubio
->bio_buf
->b_data
+ ubio
->bio_buf
->b_bcount
))))
375 panic("complete_raid5_write: bounds overflow");
376 for (count
= 0; count
< length
; count
++)
377 pdata
[count
] ^= sdata
[count
];
379 /* Free the malloced buffer */
380 if (rqe
->flags
& XFR_MALLOCED
) { /* the buffer was malloced, */
381 Free(rqe
->b
.b_data
); /* free it */
382 rqe
->flags
&= ~XFR_MALLOCED
;
384 panic("complete_raid5_write: malloc conflict");
/*
 * Reissue a data element as a write.  The buf is pointed straight at
 * the user data (useroffset sectors into the user buffer), the byte
 * count becomes datalen sectors, and the bio offset is advanced by
 * dataoffset sectors.  The active counters are only compared against
 * their recorded maxima here; the increments themselves are not
 * visible in this copy.
 */
386 if ((rqe
->b
.b_cmd
== BUF_CMD_READ
) /* this was a read */
387 &&((rqe
->flags
& XFR_BAD_SUBDISK
) == 0)) { /* and we can write this block */
388 rqe
->b
.b_cmd
= BUF_CMD_WRITE
; /* we're writing now */
389 rqe
->b
.b_bio1
.bio_done
= complete_rqe
; /* by calling us here */
390 rqe
->flags
&= ~XFR_PARITYOP
; /* reset flags that brought us here */
391 rqe
->b
.b_data
= &ubio
->bio_buf
->b_data
[rqe
->useroffset
<< DEV_BSHIFT
]; /* point to the user data */
392 rqe
->b
.b_bcount
= rqe
->datalen
<< DEV_BSHIFT
; /* length to write */
393 rqe
->b
.b_resid
= rqe
->b
.b_bcount
; /* nothing transferred */
394 rqe
->b
.b_bio1
.bio_offset
+= (off_t
)rqe
->dataoffset
<< DEV_BSHIFT
; /* point to the correct block */
395 drive
= &DRIVE
[rqe
->driveno
]; /* drive to access */
396 rqe
->b
.b_bio1
.bio_driver_info
= drive
->dev
;
397 rqg
->active
++; /* another active request */
399 /* We can't sleep here, so we just increment the counters. */
401 if (drive
->active
>= drive
->maxactive
)
402 drive
->maxactive
= drive
->active
;
404 if (vinum_conf
.active
>= vinum_conf
.maxactive
)
405 vinum_conf
.maxactive
= vinum_conf
.active
;
407 if (debug
& DEBUG_ADDRESSES
)
409 " %s dev %s, sd %d, offset 0x%llx, devoffset 0x%llx, length %d\n",
410 (rqe
->b
.b_cmd
== BUF_CMD_READ
) ? "Read" : "Write",
413 rqe
->b
.b_bio1
.bio_offset
- ((off_t
)SD
[rqe
->sdno
].driveoffset
<< DEV_BSHIFT
),
414 rqe
->b
.b_bio1
.bio_offset
,
416 if (debug
& DEBUG_LASTREQS
)
417 logrq(loginfo_raid5_data
, (union rqinfou
) rqe
, ubio
);
419 vn_strategy(drive
->vp
, &rqe
->b
.b_bio1
);
424 /* Finally, write the parity block */
/*
 * NOTE(review): the statements below operate on rqe, yet no visible
 * line points rqe back at the parity element (prqe, rqg->rqe[0]) after
 * the loops above; the gutter skips a line just before this point.
 * Confirm against a pristine copy.  Note also that XFR_PARITYOP is
 * cleared on the group flags here, whereas the data path above clears
 * it on the element flags.
 */
426 rqe
->b
.b_cmd
= BUF_CMD_WRITE
; /* we're writing now */
427 rqe
->b
.b_bio1
.bio_done
= complete_rqe
; /* by calling us here */
428 rqg
->flags
&= ~XFR_PARITYOP
; /* reset flags that brought us here */
429 rqe
->b
.b_bcount
= rqe
->buflen
<< DEV_BSHIFT
; /* length to write */
430 rqe
->b
.b_resid
= rqe
->b
.b_bcount
; /* nothing transferred */
431 drive
= &DRIVE
[rqe
->driveno
]; /* drive to access */
432 rqe
->b
.b_bio1
.bio_driver_info
= drive
->dev
;
433 rqg
->active
++; /* another active request */
435 /* We can't sleep here, so we just increment the counters. */
437 if (drive
->active
>= drive
->maxactive
)
438 drive
->maxactive
= drive
->active
;
440 if (vinum_conf
.active
>= vinum_conf
.maxactive
)
441 vinum_conf
.maxactive
= vinum_conf
.active
;
444 if (debug
& DEBUG_ADDRESSES
)
446 " %s dev %s, sd %d, offset 0x%llx, devoffset 0x%llx, length %d\n",
447 (rqe
->b
.b_cmd
== BUF_CMD_READ
) ? "Read" : "Write",
450 rqe
->b
.b_bio1
.bio_offset
- ((off_t
)SD
[rqe
->sdno
].driveoffset
<< DEV_BSHIFT
),
451 rqe
->b
.b_bio1
.bio_offset
,
453 if (debug
& DEBUG_LASTREQS
)
454 logrq(loginfo_raid5_parity
, (union rqinfou
) rqe
, ubio
);
456 vn_strategy(drive
->vp
, &rqe
->b
.b_bio1
);
459 /* Local Variables: */
460 /* fill-column: 50 */