2 * Copyright (c) 1997, 1998, 1999
3 * Nan Yang Computer Services Limited. All rights reserved.
5 * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
7 * Written by Greg Lehey
9 * This software is distributed under the so-called ``Berkeley
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. All advertising materials mentioning features or use of this software
21 * must display the following acknowledgement:
22 * This product includes software developed by Nan Yang Computer
24 * 4. Neither the name of the Company nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
28 * This software is provided ``as is'', and any express or implied
29 * warranties, including, but not limited to, the implied warranties of
30 * merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall the company or contributors be liable for any
32 * direct, indirect, incidental, special, exemplary, or consequential
33 * damages (including, but not limited to, procurement of substitute
34 * goods or services; loss of use, data, or profits; or business
35 * interruption) however caused and on any theory of liability, whether
36 * in contract, strict liability, or tort (including negligence or
37 * otherwise) arising in any way out of the use of this software, even if
38 * advised of the possibility of such damage.
40 * $Id: vinumrequest.c,v 1.30 2001/01/09 04:20:55 grog Exp grog $
41 * $FreeBSD: src/sys/dev/vinum/vinumrequest.c,v 1.44.2.5 2002/08/28 04:30:56 grog Exp $
42 * $DragonFly: src/sys/dev/raid/vinum/vinumrequest.c,v 1.19 2007/06/07 22:58:38 corecode Exp $
47 #include <sys/resourcevar.h>
49 enum requeststatus
bre(struct request
*rq
,
53 enum requeststatus
bre5(struct request
*rq
,
57 enum requeststatus
build_read_request(struct request
*rq
, int volplexno
);
58 enum requeststatus
build_write_request(struct request
*rq
);
59 enum requeststatus
build_rq_buffer(struct rqelement
*rqe
, struct plex
*plex
);
60 int find_alternate_sd(struct request
*rq
);
61 int check_range_covered(struct request
*);
62 void complete_rqe(struct bio
*bio
);
63 void complete_raid5_write(struct rqelement
*);
64 int abortrequest(struct request
*rq
, int error
);
65 void sdio_done(struct bio
*bio
);
66 struct bio
*vinum_bounds_check(struct bio
*bio
, struct volume
*vol
);
67 caddr_t
allocdatabuf(struct rqelement
*rqe
);
68 void freedatabuf(struct rqelement
*rqe
);
71 struct rqinfo rqinfo
[RQINFO_SIZE
];
72 struct rqinfo
*rqip
= rqinfo
;
75 logrq(enum rqinfo_type type
, union rqinfou info
, struct bio
*ubio
)
81 microtime(&rqip
->timestamp
); /* when did this happen? */
83 rqip
->bio
= ubio
; /* user buffer */
87 case loginfo_user_bpl
:
88 case loginfo_sdio
: /* subdisk I/O */
89 case loginfo_sdiol
: /* subdisk I/O launch */
90 case loginfo_sdiodone
: /* subdisk I/O complete */
91 bcopy(info
.bio
, &rqip
->info
.bio
, sizeof(struct bio
));
92 dev
= info
.bio
->bio_driver_info
;
93 rqip
->devmajor
= major(dev
);
94 rqip
->devminor
= minor(dev
);
99 case loginfo_raid5_data
:
100 case loginfo_raid5_parity
:
101 bcopy(info
.rqe
, &rqip
->info
.rqe
, sizeof(struct rqelement
));
102 dev
= info
.rqe
->b
.b_bio1
.bio_driver_info
;
103 rqip
->devmajor
= major(dev
);
104 rqip
->devminor
= minor(dev
);
107 case loginfo_lockwait
:
110 bcopy(info
.lockinfo
, &rqip
->info
.lockinfo
, sizeof(struct rangelock
));
118 if (rqip
>= &rqinfo
[RQINFO_SIZE
]) /* wrap around */
126 vinumstrategy(struct dev_strategy_args
*ap
)
128 cdev_t dev
= ap
->a_head
.a_dev
;
129 struct bio
*bio
= ap
->a_bio
;
130 struct buf
*bp
= bio
->bio_buf
;
131 struct bio
*nbio
= bio
;
132 struct volume
*vol
= NULL
;
135 switch (DEVTYPE(dev
)) {
137 case VINUM_RAWSD_TYPE
:
138 bio
->bio_driver_info
= dev
;
141 case VINUM_DRIVE_TYPE
:
144 * In fact, vinum doesn't handle drives: they're
145 * handled directly by the disk drivers
147 bp
->b_error
= EIO
; /* I/O error */
148 bp
->b_flags
|= B_ERROR
;
152 case VINUM_VOLUME_TYPE
: /* volume I/O */
155 if (vol
->state
!= volume_up
) { /* can't access this volume */
156 bp
->b_error
= EIO
; /* I/O error */
157 bp
->b_flags
|= B_ERROR
;
161 nbio
= vinum_bounds_check(bio
, vol
);
167 case VINUM_PLEX_TYPE
:
168 case VINUM_RAWPLEX_TYPE
:
170 * Plex I/O is pretty much the same as volume I/O
171 * for a single plex. Indicate this by passing a NULL
172 * pointer (set above) for the volume
174 bp
->b_resid
= bp
->b_bcount
; /* transfer everything */
175 vinumstart(dev
, nbio
, 0);
182 * Start a transfer. Return -1 on error,
183 * 0 if OK, 1 if we need to retry.
184 * Parameter reviveok is set when doing
185 * transfers for revives: it allows transfers to
186 * be started immediately when a revive is in
187 * progress. During revive, normal transfers
188 * are queued if they share address space with
189 * a currently active revive operation.
192 vinumstart(cdev_t dev
, struct bio
*bio
, int reviveok
)
194 struct buf
*bp
= bio
->bio_buf
;
196 int maxplex
; /* maximum number of plexes to handle */
198 struct request
*rq
; /* build up our request here */
199 enum requeststatus status
;
201 bio
->bio_driver_info
= dev
;
204 if (debug
& DEBUG_LASTREQS
)
205 logrq(loginfo_user_bp
, (union rqinfou
) bio
, bio
);
208 if ((bp
->b_bcount
% DEV_BSIZE
) != 0) { /* bad length */
209 bp
->b_error
= EINVAL
; /* invalid size */
210 bp
->b_flags
|= B_ERROR
;
214 rq
= (struct request
*) Malloc(sizeof(struct request
)); /* allocate a request struct */
215 if (rq
== NULL
) { /* can't do it */
216 bp
->b_error
= ENOMEM
; /* can't get memory */
217 bp
->b_flags
|= B_ERROR
;
221 bzero(rq
, sizeof(struct request
));
224 * Note the volume ID. This can be NULL, which
225 * the request building functions use as an
226 * indication for single plex I/O
228 rq
->bio
= bio
; /* and the user buffer struct */
230 if (DEVTYPE(dev
) == VINUM_VOLUME_TYPE
) { /* it's a volume, */
231 rq
->volplex
.volno
= Volno(dev
); /* get the volume number */
232 vol
= &VOL
[rq
->volplex
.volno
]; /* and point to it */
233 vol
->active
++; /* one more active request */
234 maxplex
= vol
->plexes
; /* consider all its plexes */
236 vol
= NULL
; /* no volume */
237 rq
->volplex
.plexno
= Plexno(dev
); /* point to the plex */
238 rq
->isplex
= 1; /* note that it's a plex */
239 maxplex
= 1; /* just the one plex */
242 if (bp
->b_cmd
== BUF_CMD_READ
) {
244 * This is a read request. Decide
245 * which plex to read from.
247 * There's a potential race condition here,
248 * since we're not locked, and we could end
249 * up multiply incrementing the round-robin
250 * counter. This doesn't have any serious
254 plexno
= vol
->preferred_plex
; /* get the plex to use */
255 if (plexno
< 0) { /* round robin */
256 plexno
= vol
->last_plex_read
;
257 vol
->last_plex_read
++;
258 if (vol
->last_plex_read
>= vol
->plexes
) /* got to the end? */
259 vol
->last_plex_read
= 0; /* wrap around */
261 status
= build_read_request(rq
, plexno
); /* build a request */
263 daddr_t diskaddr
= (daddr_t
)(bio
->bio_offset
>> DEV_BSHIFT
);
264 /* start offset of transfer */
265 status
= bre(rq
, /* build a request list */
268 diskaddr
+ (bp
->b_bcount
/ DEV_BSIZE
));
271 if (status
> REQUEST_RECOVERED
) { /* can't satisfy it */
272 if (status
== REQUEST_DOWN
) { /* not enough subdisks */
273 bp
->b_error
= EIO
; /* I/O error */
274 bp
->b_flags
|= B_ERROR
;
280 return launch_requests(rq
, reviveok
); /* now start the requests if we can */
283 * This is a write operation. We write to all plexes. If this is
284 * a RAID-4 or RAID-5 plex, we must also update the parity stripe.
288 status
= build_write_request(rq
); /* Not all the subdisks are up */
289 else { /* plex I/O */
293 diskstart
= (daddr_t
)(bio
->bio_offset
>> DEV_BSHIFT
); /* start offset of transfer */
294 diskend
= diskstart
+ bp
->b_bcount
/ DEV_BSIZE
;
295 status
= bre(rq
, Plexno(dev
),
296 &diskstart
, diskend
); /* build requests for the plex */
298 if (status
> REQUEST_RECOVERED
) { /* can't satisfy it */
299 if (status
== REQUEST_DOWN
) { /* not enough subdisks */
300 bp
->b_error
= EIO
; /* I/O error */
301 bp
->b_flags
|= B_ERROR
;
307 return launch_requests(rq
, reviveok
); /* now start the requests if we can */
312 * Call the low-level strategy routines to
313 * perform the requests in a struct request
316 launch_requests(struct request
*rq
, int reviveok
)
319 int rqno
; /* loop index */
320 struct rqelement
*rqe
; /* current element */
322 int rcount
; /* request count */
325 * First find out whether we're reviving, and the
326 * request contains a conflict. If so, we hang
327 * the request off plex->waitlist of the first
328 * plex we find which is reviving
331 if ((rq
->flags
& XFR_REVIVECONFLICT
) /* possible revive conflict */
332 &&(!reviveok
)) { /* and we don't want to do it now, */
334 struct request
*waitlist
; /* point to the waitlist */
337 if (sd
->waitlist
!= NULL
) { /* something there already, */
338 waitlist
= sd
->waitlist
;
339 while (waitlist
->next
!= NULL
) /* find the end */
340 waitlist
= waitlist
->next
;
341 waitlist
->next
= rq
; /* hook our request there */
343 sd
->waitlist
= rq
; /* hook our request at the front */
346 if (debug
& DEBUG_REVIVECONFLICT
) {
348 "Revive conflict sd %d: %p\n%s dev %d.%d, offset 0x%llx, length %d\n",
351 (rq
->bio
->bio_buf
->b_cmd
& BUF_CMD_READ
) ? "Read" : "Write",
352 major(((cdev_t
)rq
->bio
->bio_driver_info
)),
353 minor(((cdev_t
)rq
->bio
->bio_driver_info
)),
355 rq
->bio
->bio_buf
->b_bcount
);
358 return 0; /* and get out of here */
360 rq
->active
= 0; /* nothing yet */
362 if (debug
& DEBUG_ADDRESSES
)
364 "Request: %p\n%s dev %d.%d, offset 0x%llx, length %d\n",
366 (rq
->bio
->bio_buf
->b_cmd
== BUF_CMD_READ
) ? "Read" : "Write",
367 major(((cdev_t
)rq
->bio
->bio_driver_info
)),
368 minor(((cdev_t
)rq
->bio
->bio_driver_info
)),
370 rq
->bio
->bio_buf
->b_bcount
);
371 vinum_conf
.lastrq
= rq
;
372 vinum_conf
.lastbio
= rq
->bio
;
373 if (debug
& DEBUG_LASTREQS
)
374 logrq(loginfo_user_bpl
, (union rqinfou
) rq
->bio
, rq
->bio
);
378 * This loop happens without any participation
379 * of the bottom half, so it requires no
382 for (rqg
= rq
->rqg
; rqg
!= NULL
; rqg
= rqg
->next
) { /* through the whole request chain */
383 rqg
->active
= rqg
->count
; /* they're all active */
384 for (rqno
= 0; rqno
< rqg
->count
; rqno
++) {
385 rqe
= &rqg
->rqe
[rqno
];
386 if (rqe
->flags
& XFR_BAD_SUBDISK
) /* this subdisk is bad, */
387 rqg
->active
--; /* one less active request */
389 if (rqg
->active
) /* we have at least one active request, */
390 rq
->active
++; /* one more active request group */
394 * Now fire off the requests. In this loop the
395 * bottom half could be completing requests
396 * before we finish, so we need critical section protection.
399 for (rqg
= rq
->rqg
; rqg
!= NULL
;) { /* through the whole request chain */
400 if (rqg
->lockbase
>= 0) /* this rqg needs a lock first */
401 rqg
->lock
= lockrange(rqg
->lockbase
, rqg
->rq
->bio
->bio_buf
, &PLEX
[rqg
->plexno
]);
403 for (rqno
= 0; rqno
< rcount
;) {
406 rqe
= &rqg
->rqe
[rqno
];
409 * Point to next rqg before the bottom end
410 * changes the structures.
412 if (++rqno
>= rcount
)
414 if ((rqe
->flags
& XFR_BAD_SUBDISK
) == 0) { /* this subdisk is good, */
415 drive
= &DRIVE
[rqe
->driveno
]; /* look at drive */
417 if (drive
->active
>= drive
->maxactive
)
418 drive
->maxactive
= drive
->active
;
420 if (vinum_conf
.active
>= vinum_conf
.maxactive
)
421 vinum_conf
.maxactive
= vinum_conf
.active
;
423 dev
= rqe
->b
.b_bio1
.bio_driver_info
;
425 if (debug
& DEBUG_ADDRESSES
)
427 " %s dev %d.%d, sd %d, offset 0x%llx, devoffset 0x%llx, length %d\n",
428 (rqe
->b
.b_cmd
== BUF_CMD_READ
) ? "Read" : "Write",
432 rqe
->b
.b_bio1
.bio_offset
- ((off_t
)SD
[rqe
->sdno
].driveoffset
<< DEV_BSHIFT
),
433 rqe
->b
.b_bio1
.bio_offset
,
435 if (debug
& DEBUG_LASTREQS
)
436 logrq(loginfo_rqe
, (union rqinfou
) rqe
, rq
->bio
);
438 /* fire off the request */
439 dev_dstrategy(dev
, &rqe
->b
.b_bio1
);
448 * define the low-level requests needed to perform a
449 * high-level I/O operation for a specific plex 'plexno'.
451 * Return REQUEST_OK if all subdisks involved in the request are up,
452 * REQUEST_DOWN if some subdisks are not up, and REQUEST_EOF if the
453 * request is at least partially outside the bounds of the subdisks.
455 * Modify the pointer *diskstart to point to the end address. On
456 * read, return on the first bad subdisk, so that the caller
457 * (build_read_request) can try alternatives.
459 * On entry to this routine, the rqg structures are not assigned. The
460 * assignment is performed by expandrq(). Strictly speaking, the
461 * elements rqe->sdno of all entries should be set to -1, since 0
462 * (from bzero) is a valid subdisk number. We avoid this problem by
463 * initializing the ones we use, and not looking at the others (index
467 bre(struct request
*rq
,
476 struct buf
*bp
; /* user's bp */
478 enum requeststatus status
; /* return value */
479 daddr_t plexoffset
; /* offset of transfer in plex */
480 daddr_t stripebase
; /* base address of stripe (1st subdisk) */
481 daddr_t stripeoffset
; /* offset in stripe */
482 daddr_t blockoffset
; /* offset in stripe on subdisk */
483 struct rqelement
*rqe
; /* point to this request information */
484 daddr_t diskstart
= *diskaddr
; /* remember where this transfer starts */
485 enum requeststatus s
; /* temp return value */
487 bio
= rq
->bio
; /* buffer pointer */
489 status
= REQUEST_OK
; /* return value: OK until proven otherwise */
490 plex
= &PLEX
[plexno
]; /* point to the plex */
492 switch (plex
->organization
) {
494 sd
= NULL
; /* (keep compiler quiet) */
495 for (sdno
= 0; sdno
< plex
->subdisks
; sdno
++) {
496 sd
= &SD
[plex
->sdnos
[sdno
]];
497 if (*diskaddr
< sd
->plexoffset
) /* we must have a hole, */
498 status
= REQUEST_DEGRADED
; /* note the fact */
499 if (*diskaddr
< (sd
->plexoffset
+ sd
->sectors
)) { /* the request starts in this subdisk */
500 rqg
= allocrqg(rq
, 1); /* space for the request */
501 if (rqg
== NULL
) { /* malloc failed */
502 bp
->b_error
= ENOMEM
;
503 bp
->b_flags
|= B_ERROR
;
504 return REQUEST_ENOMEM
;
506 rqg
->plexno
= plexno
;
508 rqe
= &rqg
->rqe
[0]; /* point to the element */
509 rqe
->rqg
= rqg
; /* group */
510 rqe
->sdno
= sd
->sdno
; /* put in the subdisk number */
511 plexoffset
= *diskaddr
; /* start offset in plex */
512 rqe
->sdoffset
= plexoffset
- sd
->plexoffset
; /* start offset in subdisk */
513 rqe
->useroffset
= plexoffset
- diskstart
; /* start offset in user buffer */
515 rqe
->datalen
= min(diskend
- *diskaddr
, /* number of sectors to transfer in this sd */
516 sd
->sectors
- rqe
->sdoffset
);
517 rqe
->groupoffset
= 0; /* no groups for concatenated plexes */
519 rqe
->buflen
= rqe
->datalen
; /* buffer length is data buffer length */
521 rqe
->driveno
= sd
->driveno
;
522 if (sd
->state
!= sd_up
) { /* *now* we find the sd is down */
523 s
= checksdstate(sd
, rq
, *diskaddr
, diskend
); /* do we need to change state? */
524 if (s
== REQUEST_DOWN
) { /* down? */
525 rqe
->flags
= XFR_BAD_SUBDISK
; /* yup */
526 if (rq
->bio
->bio_buf
->b_cmd
== BUF_CMD_READ
) /* read request, */
527 return REQUEST_DEGRADED
; /* give up here */
529 * If we're writing, don't give up
530 * because of a bad subdisk. Go
531 * through to the bitter end, but note
532 * which ones we can't access.
534 status
= REQUEST_DEGRADED
; /* can't do it all */
537 *diskaddr
+= rqe
->datalen
; /* bump the address */
538 if (build_rq_buffer(rqe
, plex
)) { /* build the buffer */
540 bp
->b_error
= ENOMEM
;
541 bp
->b_flags
|= B_ERROR
;
542 return REQUEST_ENOMEM
; /* can't do it */
545 if (*diskaddr
== diskend
) /* we're finished, */
546 break; /* get out of here */
549 * We've got to the end of the plex. Have we got to the end of
550 * the transfer? It would seem that having an offset beyond the
551 * end of the subdisk is an error, but in fact it can happen if
552 * the volume has another plex of different size. There's a valid
553 * question as to why you would want to do this, but currently
556 * In a previous version, I returned REQUEST_DOWN here. I think
557 * REQUEST_EOF is more appropriate now.
559 if (diskend
> sd
->sectors
+ sd
->plexoffset
) /* pointing beyond EOF? */
560 status
= REQUEST_EOF
;
565 while (*diskaddr
< diskend
) { /* until we get it all sorted out */
566 if (*diskaddr
>= plex
->length
) /* beyond the end of the plex */
567 return REQUEST_EOF
; /* can't continue */
569 /* The offset of the start address from the start of the stripe. */
570 stripeoffset
= *diskaddr
% (plex
->stripesize
* plex
->subdisks
);
572 /* The plex-relative address of the start of the stripe. */
573 stripebase
= *diskaddr
- stripeoffset
;
575 /* The number of the subdisk in which the start is located. */
576 sdno
= stripeoffset
/ plex
->stripesize
;
578 /* The offset from the beginning of the stripe on this subdisk. */
579 blockoffset
= stripeoffset
% plex
->stripesize
;
581 sd
= &SD
[plex
->sdnos
[sdno
]]; /* the subdisk in question */
582 rqg
= allocrqg(rq
, 1); /* space for the request */
583 if (rqg
== NULL
) { /* malloc failed */
584 bp
->b_error
= ENOMEM
;
585 bp
->b_flags
|= B_ERROR
;
586 return REQUEST_ENOMEM
;
588 rqg
->plexno
= plexno
;
590 rqe
= &rqg
->rqe
[0]; /* point to the element */
592 rqe
->sdoffset
= stripebase
/ plex
->subdisks
+ blockoffset
; /* start offset in this subdisk */
593 rqe
->useroffset
= *diskaddr
- diskstart
; /* The offset of the start in the user buffer */
595 rqe
->datalen
= min(diskend
- *diskaddr
, /* the amount remaining to transfer */
596 plex
->stripesize
- blockoffset
); /* and the amount left in this stripe */
597 rqe
->groupoffset
= 0; /* no groups for striped plexes */
599 rqe
->buflen
= rqe
->datalen
; /* buffer length is data buffer length */
601 rqe
->sdno
= sd
->sdno
; /* put in the subdisk number */
602 rqe
->driveno
= sd
->driveno
;
604 if (sd
->state
!= sd_up
) { /* *now* we find the sd is down */
605 s
= checksdstate(sd
, rq
, *diskaddr
, diskend
); /* do we need to change state? */
606 if (s
== REQUEST_DOWN
) { /* down? */
607 rqe
->flags
= XFR_BAD_SUBDISK
; /* yup */
608 if (rq
->bio
->bio_buf
->b_cmd
== BUF_CMD_READ
) /* read request, */
609 return REQUEST_DEGRADED
; /* give up here */
611 * If we're writing, don't give up
612 * because of a bad subdisk. Go through
613 * to the bitter end, but note which
614 * ones we can't access.
616 status
= REQUEST_DEGRADED
; /* can't do it all */
620 * It would seem that having an offset
621 * beyond the end of the subdisk is an
622 * error, but in fact it can happen if the
623 * volume has another plex of different
624 * size. There's a valid question as to why
625 * you would want to do this, but currently
628 if (rqe
->sdoffset
+ rqe
->datalen
> sd
->sectors
) { /* ends beyond the end of the subdisk? */
629 rqe
->datalen
= sd
->sectors
- rqe
->sdoffset
; /* truncate */
631 if (debug
& DEBUG_EOFINFO
) { /* tell on the request */
633 "vinum: EOF on plex %s, sd %s offset %llx (user offset %x)\n",
637 bp
->b_bio1
.bio_offset
);
639 "vinum: stripebase %x, stripeoffset %x, blockoffset %x\n",
646 if (build_rq_buffer(rqe
, plex
)) { /* build the buffer */
648 bp
->b_error
= ENOMEM
;
649 bp
->b_flags
|= B_ERROR
;
650 return REQUEST_ENOMEM
; /* can't do it */
652 *diskaddr
+= rqe
->datalen
; /* look at the remainder */
653 if ((*diskaddr
< diskend
) /* didn't finish the request on this stripe */
654 &&(*diskaddr
< plex
->length
)) { /* and there's more to come */
655 plex
->multiblock
++; /* count another one */
656 if (sdno
== plex
->subdisks
- 1) /* last subdisk, */
657 plex
->multistripe
++; /* another stripe as well */
664 * RAID-4 and RAID-5 are complicated enough to have their own
669 status
= bre5(rq
, plexno
, diskaddr
, diskend
);
673 log(LOG_ERR
, "vinum: invalid plex type %d in bre\n", plex
->organization
);
674 status
= REQUEST_DOWN
; /* can't access it */
681 * Build up a request structure for reading volumes.
682 * This function is not needed for plex reads, since there's
683 * no recovery if a plex read can't be satisfied.
686 build_read_request(struct request
*rq
, /* request */
688 { /* index in the volume's plex table */
691 daddr_t startaddr
; /* offset of previous part of transfer */
692 daddr_t diskaddr
; /* offset of current part of transfer */
693 daddr_t diskend
; /* and end offset of transfer */
694 int plexno
; /* plex index in vinum_conf */
695 struct rqgroup
*rqg
; /* point to the request we're working on */
696 struct volume
*vol
; /* volume in question */
697 int recovered
= 0; /* set if we recover a read */
698 enum requeststatus status
= REQUEST_OK
;
699 int plexmask
; /* bit mask of plexes, for recovery */
701 bio
= rq
->bio
; /* buffer pointer */
703 diskaddr
= bio
->bio_offset
>> DEV_BSHIFT
; /* start offset of transfer */
704 diskend
= diskaddr
+ (bp
->b_bcount
/ DEV_BSIZE
); /* and end offset of transfer */
705 rqg
= &rq
->rqg
[plexindex
]; /* plex request */
706 vol
= &VOL
[rq
->volplex
.volno
]; /* point to volume */
708 while (diskaddr
< diskend
) { /* build up request components */
709 startaddr
= diskaddr
;
710 status
= bre(rq
, vol
->plex
[plexindex
], &diskaddr
, diskend
); /* build up a request */
715 case REQUEST_RECOVERED
:
717 * XXX FIXME if we have more than one plex, and we can
718 * satisfy the request from another, don't use the
719 * recovered request, since it's more expensive.
727 * If we get here, our request is not complete. Try
728 * to fill in the missing parts from another plex.
729 * This can happen multiple times in this function,
730 * and we reinitialize the plex mask each time, since
731 * we could have a hole in our plexes.
734 case REQUEST_DOWN
: /* can't access the plex */
735 case REQUEST_DEGRADED
: /* can't access the plex */
736 plexmask
= ((1 << vol
->plexes
) - 1) /* all plexes in the volume */
737 &~(1 << plexindex
); /* except for the one we were looking at */
738 for (plexno
= 0; plexno
< vol
->plexes
; plexno
++) {
739 if (plexmask
== 0) /* no plexes left to try */
740 return REQUEST_DOWN
; /* failed */
741 diskaddr
= startaddr
; /* start at the beginning again */
742 if (plexmask
& (1 << plexno
)) { /* we haven't tried this plex yet */
743 bre(rq
, vol
->plex
[plexno
], &diskaddr
, diskend
); /* try a request */
744 if (diskaddr
> startaddr
) { /* we satisfied another part */
745 recovered
= 1; /* we recovered from the problem */
746 status
= REQUEST_OK
; /* don't complain about it */
751 if (diskaddr
== startaddr
) /* didn't get any further, */
755 vol
->recovered_reads
+= recovered
; /* adjust our recovery count */
761 * Build up a request structure for writes.
762 * Return 0 if all subdisks involved in the request are up, 1 if some
763 * subdisks are not up, and -1 if the request is at least partially
764 * outside the bounds of the subdisks.
767 build_write_request(struct request
*rq
)
771 daddr_t diskstart
; /* offset of current part of transfer */
772 daddr_t diskend
; /* and end offset of transfer */
773 int plexno
; /* plex index in vinum_conf */
774 struct volume
*vol
; /* volume in question */
775 enum requeststatus status
;
777 bio
= rq
->bio
; /* buffer pointer */
779 vol
= &VOL
[rq
->volplex
.volno
]; /* point to volume */
780 diskend
= (daddr_t
)(bio
->bio_offset
>> DEV_BSHIFT
) + (bp
->b_bcount
/ DEV_BSIZE
); /* end offset of transfer */
781 status
= REQUEST_DOWN
; /* assume the worst */
782 for (plexno
= 0; plexno
< vol
->plexes
; plexno
++) {
783 diskstart
= (daddr_t
)(bio
->bio_offset
>> DEV_BSHIFT
); /* start offset of transfer */
785 * Build requests for the plex.
786 * We take the best possible result here (min,
787 * not max): we're happy if we can write at all
789 status
= min(status
, bre(rq
,
797 /* Fill in the struct buf part of a request element. */
799 build_rq_buffer(struct rqelement
*rqe
, struct plex
*plex
)
801 struct sd
*sd
; /* point to subdisk */
804 struct buf
*ubp
; /* user (high level) buffer header */
807 vol
= &VOL
[rqe
->rqg
->rq
->volplex
.volno
];
808 sd
= &SD
[rqe
->sdno
]; /* point to subdisk */
810 ubio
= rqe
->rqg
->rq
->bio
; /* pointer to user buffer header */
813 /* Initialize the buf struct */
814 /* copy these flags from user bp */
815 bp
->b_flags
= ubp
->b_flags
& (B_ORDERED
| B_NOCACHE
| B_ASYNC
);
816 bp
->b_cmd
= ubp
->b_cmd
;
818 if (rqe
->flags
& XFR_BUFLOCKED
) /* paranoia */
819 panic("build_rq_buffer: rqe already locked"); /* XXX remove this when we're sure */
821 BUF_LOCKINIT(bp
); /* get a lock for the buffer */
822 BUF_LOCK(bp
, LK_EXCLUSIVE
); /* and lock it */
825 rqe
->flags
|= XFR_BUFLOCKED
;
826 bp
->b_bio1
.bio_done
= complete_rqe
;
828 * You'd think that we wouldn't need to even
829 * build the request buffer for a dead subdisk,
830 * but in some cases we need information like
831 * the user buffer address. Err on the side of
832 * generosity and supply what we can. That
833 * obviously doesn't include drive information
834 * when the drive is dead.
836 if ((rqe
->flags
& XFR_BAD_SUBDISK
) == 0) /* subdisk is accessible, */
837 bp
->b_bio1
.bio_driver_info
= DRIVE
[rqe
->driveno
].dev
; /* drive device */
838 bp
->b_bio1
.bio_offset
= (off_t
)(rqe
->sdoffset
+ sd
->driveoffset
) << DEV_BSHIFT
; /* start address */
839 bp
->b_bcount
= rqe
->buflen
<< DEV_BSHIFT
; /* number of bytes to transfer */
840 bp
->b_resid
= bp
->b_bcount
; /* and it's still all waiting */
842 if (rqe
->flags
& XFR_MALLOCED
) { /* this operation requires a malloced buffer */
843 bp
->b_data
= Malloc(bp
->b_bcount
); /* get a buffer to put it in */
844 if (bp
->b_data
== NULL
) { /* failed */
845 abortrequest(rqe
->rqg
->rq
, ENOMEM
);
846 return REQUEST_ENOMEM
; /* no memory */
850 * Point directly to user buffer data. This means
851 * that we don't need to do anything when we have
852 * finished the transfer
854 bp
->b_data
= ubp
->b_data
+ rqe
->useroffset
* DEV_BSIZE
;
856 * On a recovery read, we perform an XOR of
857 * all blocks to the user buffer. To make
858 * this work, we first clean out the buffer
860 if ((rqe
->flags
& (XFR_RECOVERY_READ
| XFR_BAD_SUBDISK
))
861 == (XFR_RECOVERY_READ
| XFR_BAD_SUBDISK
)) { /* bad subdisk of a recovery read */
862 int length
= rqe
->grouplen
<< DEV_BSHIFT
; /* and count involved */
863 char *data
= (char *) &rqe
->b
.b_data
[rqe
->groupoffset
<< DEV_BSHIFT
]; /* destination */
865 bzero(data
, length
); /* clean it out */
871 * Abort a request: free resources and complete the
872 * user request with the specified error
875 abortrequest(struct request
*rq
, int error
)
877 struct buf
*bp
= rq
->bio
->bio_buf
; /* user buffer */
880 freerq(rq
); /* free everything we're doing */
881 bp
->b_flags
|= B_ERROR
;
882 return error
; /* and give up */
886 * Check that our transfer will cover the
887 * complete address space of the user request.
889 * Return 1 if it can, otherwise 0
892 check_range_covered(struct request
*rq
)
897 /* Perform I/O on a subdisk */
899 sdio(struct bio
*bio
)
907 struct buf
*bp
= bio
->bio_buf
;
909 dev
= bio
->bio_driver_info
;
912 if (debug
& DEBUG_LASTREQS
)
913 logrq(loginfo_sdio
, (union rqinfou
) bio
, bio
);
915 sd
= &SD
[Sdno(dev
)]; /* point to the subdisk */
916 drive
= &DRIVE
[sd
->driveno
];
918 if (drive
->state
!= drive_up
) {
919 if (sd
->state
>= sd_crashed
) {
920 if (bp
->b_cmd
!= BUF_CMD_READ
) /* writing, */
921 set_sd_state(sd
->sdno
, sd_stale
, setstate_force
);
923 set_sd_state(sd
->sdno
, sd_crashed
, setstate_force
);
926 bp
->b_flags
|= B_ERROR
;
931 * We allow access to any kind of subdisk as long as we can expect
932 * to get the I/O performed.
934 if (sd
->state
< sd_empty
) { /* nothing to talk to, */
936 bp
->b_flags
|= B_ERROR
;
941 sbp
= (struct sdbuf
*) Malloc(sizeof(struct sdbuf
));
943 bp
->b_error
= ENOMEM
;
944 bp
->b_flags
|= B_ERROR
;
948 sddev
= DRIVE
[sd
->driveno
].dev
; /* device */
949 bzero(sbp
, sizeof(struct sdbuf
)); /* start with nothing */
950 sbp
->b
.b_cmd
= bp
->b_cmd
;
951 sbp
->b
.b_bcount
= bp
->b_bcount
; /* number of bytes to transfer */
952 sbp
->b
.b_resid
= bp
->b_resid
; /* and amount waiting */
953 sbp
->b
.b_data
= bp
->b_data
; /* data buffer */
954 BUF_LOCKINIT(&sbp
->b
); /* get a lock for the buffer */
955 BUF_LOCK(&sbp
->b
, LK_EXCLUSIVE
); /* and lock it */
956 BUF_KERNPROC(&sbp
->b
);
958 sbp
->b
.b_bio1
.bio_offset
= bio
->bio_offset
+ ((off_t
)sd
->driveoffset
<< DEV_BSHIFT
);
959 sbp
->b
.b_bio1
.bio_done
= sdio_done
; /* come here on completion */
960 sbp
->bio
= bio
; /* note the address of the original header */
961 sbp
->sdno
= sd
->sdno
; /* note for statistics */
962 sbp
->driveno
= sd
->driveno
;
963 endoffset
= (daddr_t
)(bio
->bio_offset
>> DEV_BSHIFT
) + sbp
->b
.b_bcount
/ DEV_BSIZE
; /* final sector offset */
964 if (endoffset
> sd
->sectors
) { /* beyond the end */
965 sbp
->b
.b_bcount
-= (endoffset
- sd
->sectors
) * DEV_BSIZE
; /* trim */
966 if (sbp
->b
.b_bcount
<= 0) { /* nothing to transfer */
967 bp
->b_resid
= bp
->b_bcount
; /* nothing transferred */
970 BUF_LOCKFREE(&sbp
->b
);
976 if (debug
& DEBUG_ADDRESSES
)
978 " %s dev %d.%d, sd %d, offset 0x%llx, devoffset 0x%llx, length %d\n",
979 (sbp
->b
.b_cmd
== BUF_CMD_READ
) ? "Read" : "Write",
983 sbp
->b
.b_bio1
.bio_offset
- ((off_t
)SD
[sbp
->sdno
].driveoffset
<< DEV_BSHIFT
),
984 sbp
->b
.b_bio1
.bio_offset
,
989 if (debug
& DEBUG_LASTREQS
)
990 logrq(loginfo_sdiol
, (union rqinfou
) &sbp
->b
.b_bio1
, &sbp
->b
.b_bio1
);
992 dev_dstrategy(sddev
, &sbp
->b
.b_bio1
);
997 * Determine the size of the transfer, and make sure it is
998 * within the boundaries of the partition. Adjust transfer
999 * if needed, and signal errors or early completion.
1001 * Volumes are simpler than disk slices: they only contain
1002 * one component (though we call them a, b and c to make
1003 * system utilities happy), and they always take up the
1004 * complete space of the "partition".
1006 * I'm still not happy with this: why should the label be
1007 * protected? If it weren't so damned difficult to write
1008 * one in the first place (because it's protected), it wouldn't
1012 vinum_bounds_check(struct bio
*bio
, struct volume
*vol
)
1014 struct buf
*bp
= bio
->bio_buf
;
1016 int maxsize
= vol
->size
; /* size of the partition (sectors) */
1017 int size
= (bp
->b_bcount
+ DEV_BSIZE
- 1) >> DEV_BSHIFT
; /* size of this request (sectors) */
1018 daddr_t blkno
= (daddr_t
)(bio
->bio_offset
>> DEV_BSHIFT
);
1020 if (size
== 0) /* no transfer specified, */
1021 return 0; /* treat as EOF */
1022 /* beyond partition? */
1023 if (bio
->bio_offset
< 0 /* negative start */
1024 || blkno
+ size
> maxsize
) { /* or goes beyond the end of the partition */
1025 /* if exactly at end of disk, return an EOF */
1026 if (blkno
== maxsize
) {
1027 bp
->b_resid
= bp
->b_bcount
;
1030 /* or truncate if part of it fits */
1031 size
= maxsize
- blkno
;
1032 if (size
<= 0) { /* nothing to transfer */
1033 bp
->b_error
= EINVAL
;
1034 bp
->b_flags
|= B_ERROR
;
1037 bp
->b_bcount
= size
<< DEV_BSHIFT
;
1039 nbio
= push_bio(bio
);
1040 nbio
->bio_offset
= bio
->bio_offset
;
1045 * Allocate a request group and hook
1046 * it into the list for rq
1049 allocrqg(struct request
*rq
, int elements
)
1051 struct rqgroup
*rqg
; /* the one we're going to allocate */
1052 int size
= sizeof(struct rqgroup
) + elements
* sizeof(struct rqelement
);
1054 rqg
= (struct rqgroup
*) Malloc(size
);
1055 if (rqg
!= NULL
) { /* malloc OK, */
1056 if (rq
->rqg
) /* we already have requests */
1057 rq
->lrqg
->next
= rqg
; /* hang it off the end */
1058 else /* first request */
1059 rq
->rqg
= rqg
; /* at the start */
1060 rq
->lrqg
= rqg
; /* this one is the last in the list */
1062 bzero(rqg
, size
); /* no old junk */
1063 rqg
->rq
= rq
; /* point back to the parent request */
1064 rqg
->count
= elements
; /* number of requests in the group */
1065 rqg
->lockbase
= -1; /* no lock required yet */
1071 * Deallocate a request group out of a chain. We do
1072 * this by linear search: the chain is short, this
1073 * almost never happens, and currently it can only
1074 * happen to the first member of the chain.
1077 deallocrqg(struct rqgroup
*rqg
)
1079 struct rqgroup
*rqgc
= rqg
->rq
->rqg
; /* point to the request chain */
1081 if (rqg
->lock
) /* got a lock? */
1082 unlockrange(rqg
->plexno
, rqg
->lock
); /* yes, free it */
1083 if (rqgc
== rqg
) /* we're first in line */
1084 rqg
->rq
->rqg
= rqg
->next
; /* unhook ourselves */
1086 while ((rqgc
->next
!= NULL
) /* find the group */
1087 &&(rqgc
->next
!= rqg
))
1089 if (rqgc
->next
== NULL
)
1091 "vinum deallocrqg: rqg %p not found in request %p\n",
1095 rqgc
->next
= rqg
->next
; /* make the chain jump over us */
1100 /* Local Variables: */
1101 /* fill-column: 50 */