sys/dev/raid/vinum/vinuminterrupt.c

   1 /* vinuminterrupt.c: bottom half of the driver */
   2
   3 /*-
   4  * Copyright (c) 1997, 1998, 1999
   5  *      Nan Yang Computer Services Limited.  All rights reserved.
   6  *
   7  *  Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
   8  *
   9  *  Written by Greg Lehey
  10  *
  11  *  This software is distributed under the so-called ``Berkeley
  12  *  License'':
  13  *
  14  * Redistribution and use in source and binary forms, with or without
  15  * modification, are permitted provided that the following conditions
  16  * are met:
  17  * 1. Redistributions of source code must retain the above copyright
  18  *    notice, this list of conditions and the following disclaimer.
  19  * 2. Redistributions in binary form must reproduce the above copyright
  20  *    notice, this list of conditions and the following disclaimer in the
  21  *    documentation and/or other materials provided with the distribution.
  22  * 3. All advertising materials mentioning features or use of this software
  23  *    must display the following acknowledgement:
  24  *      This product includes software developed by Nan Yang Computer
  25  *      Services Limited.
  26  * 4. Neither the name of the Company nor the names of its contributors
  27  *    may be used to endorse or promote products derived from this software
  28  *    without specific prior written permission.
  29  *
  30  * This software is provided ``as is'', and any express or implied
  31  * warranties, including, but not limited to, the implied warranties of
  32  * merchantability and fitness for a particular purpose are disclaimed.
  33  * In no event shall the company or contributors be liable for any
  34  * direct, indirect, incidental, special, exemplary, or consequential
  35  * damages (including, but not limited to, procurement of substitute
  36  * goods or services; loss of use, data, or profits; or business
  37  * interruption) however caused and on any theory of liability, whether
  38  * in contract, strict liability, or tort (including negligence or
  39  * otherwise) arising in any way out of the use of this software, even if
  40  * advised of the possibility of such damage.
  41  *
  42  * $Id: vinuminterrupt.c,v 1.12 2000/11/24 03:41:42 grog Exp grog $
  43  * $FreeBSD: src/sys/dev/vinum/vinuminterrupt.c,v 1.25.2.3 2001/05/28 05:56:27 grog Exp $
  44  * $DragonFly: src/sys/dev/raid/vinum/vinuminterrupt.c,v 1.13 2007/08/01 11:46:46 swildner Exp $
  45  */
  46
  47 #include "vinumhdr.h"
  48 #include "request.h"
  49 #include <sys/resourcevar.h>
  50
  51 void complete_raid5_write(struct rqelement *);
  52 void complete_rqe(struct bio *bio);
  53 void sdio_done(struct bio *bio);
  54
  55 /*
  56  * Take a completed buffer, transfer the data back if
  57  * it's a read, and complete the high-level request
  58  * if this is the last subrequest.
  59  *
  60  * The bp parameter is in fact a struct rqelement, which
  61  * includes a couple of extras at the end.
  62  */
  63 void
  64 complete_rqe(struct bio *bio)
  65 {
  66     struct buf *bp = bio->bio_buf;
  67     struct rqelement *rqe;
  68     struct request *rq;
  69     struct rqgroup *rqg;
  70     struct bio *ubio;                                       /* user buffer */
  71     struct drive *drive;
  72     struct sd *sd;
  73     char *gravity;                                          /* for error messages */
  74
  75     rqe = (struct rqelement *) bp;                          /* point to the element that completed */
  76     rqg = rqe->rqg;                                         /* and the request group */
  77     rq = rqg->rq;                                           /* and the complete request */
  78     ubio = rq->bio;                                         /* user buffer */
  79
  80 #ifdef VINUMDEBUG
  81     if (debug & DEBUG_LASTREQS)
  82         logrq(loginfo_iodone, (union rqinfou) rqe, ubio);
  83 #endif
  84     drive = &DRIVE[rqe->driveno];
  85     drive->active--;                                        /* one less outstanding I/O on this drive */
  86     vinum_conf.active--;                                    /* one less outstanding I/O globally */
  87     if ((drive->active == (DRIVE_MAXACTIVE - 1))            /* we were at the drive limit */
  88     ||(vinum_conf.active == VINUM_MAXACTIVE))               /* or the global limit */
  89         wakeup(&launch_requests);                           /* let another one at it */
  90     if ((bp->b_flags & B_ERROR) != 0) {                     /* transfer in error */
  91         gravity = "";
  92         sd = &SD[rqe->sdno];
  93
  94         if (bp->b_error != 0)                               /* did it return a number? */
  95             rq->error = bp->b_error;                        /* yes, put it in. */
  96         else if (rq->error == 0)                            /* no: do we have one already? */
  97             rq->error = EIO;                                /* no: catchall "I/O error" */
  98         sd->lasterror = rq->error;
  99         if (bp->b_cmd == BUF_CMD_READ) {
 100             if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
 101                 gravity = " fatal";
 102                 set_sd_state(rqe->sdno, sd_crashed, setstate_force); /* subdisk is crashed */
 103             }
 104             log(LOG_ERR,
 105                 "%s:%s read error, offset %lld for %d bytes\n",
 106                 gravity,
 107                 sd->name,
 108                 (long long)bio->bio_offset,
 109                 bp->b_bcount);
 110         } else {                                            /* write operation */
 111             if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
 112                 gravity = "fatal ";
 113                 set_sd_state(rqe->sdno, sd_stale, setstate_force); /* subdisk is stale */
 114             }
 115             log(LOG_ERR,
 116                 "%s:%s write error, offset %lld for %d bytes\n",
 117                 gravity,
 118                 sd->name,
 119                 (long long)bio->bio_offset,
 120                 bp->b_bcount);
 121         }
 122         log(LOG_ERR,
 123             "%s: user buffer offset %lld for %d bytes\n",
 124             sd->name,
 125             (long long)ubio->bio_offset,
 126             ubio->bio_buf->b_bcount);
 127         if (rq->error == ENXIO) {                           /* the drive's down too */
 128             log(LOG_ERR,
 129                 "%s: fatal drive I/O error, offset %lld for %d bytes\n",
 130                 DRIVE[rqe->driveno].label.name,
 131                 (long long)bio->bio_offset,
 132                 bp->b_bcount);
 133             DRIVE[rqe->driveno].lasterror = rq->error;
 134             set_drive_state(rqe->driveno,                   /* take the drive down */
 135                 drive_down,
 136                 setstate_force);
 137         }
 138     }
 139     /* Now update the statistics */
 140     if (bp->b_cmd == BUF_CMD_READ) {                            /* read operation */
 141         DRIVE[rqe->driveno].reads++;
 142         DRIVE[rqe->driveno].bytes_read += bp->b_bcount;
 143         SD[rqe->sdno].reads++;
 144         SD[rqe->sdno].bytes_read += bp->b_bcount;
 145         PLEX[rqe->rqg->plexno].reads++;
 146         PLEX[rqe->rqg->plexno].bytes_read += bp->b_bcount;
 147         if (PLEX[rqe->rqg->plexno].volno >= 0) {            /* volume I/O, not plex */
 148             VOL[PLEX[rqe->rqg->plexno].volno].reads++;
 149             VOL[PLEX[rqe->rqg->plexno].volno].bytes_read += bp->b_bcount;
 150         }
 151     } else {                                                /* write operation */
 152         DRIVE[rqe->driveno].writes++;
 153         DRIVE[rqe->driveno].bytes_written += bp->b_bcount;
 154         SD[rqe->sdno].writes++;
 155         SD[rqe->sdno].bytes_written += bp->b_bcount;
 156         PLEX[rqe->rqg->plexno].writes++;
 157         PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount;
 158         if (PLEX[rqe->rqg->plexno].volno >= 0) {            /* volume I/O, not plex */
 159             VOL[PLEX[rqe->rqg->plexno].volno].writes++;
 160             VOL[PLEX[rqe->rqg->plexno].volno].bytes_written += bp->b_bcount;
 161         }
 162     }
 163     if (rqg->flags & XFR_RECOVERY_READ) {                   /* recovery read, */
 164         int *sdata;                                         /* source */
 165         int *data;                                          /* and group data */
 166         int length;                                         /* and count involved */
 167         int count;                                          /* loop counter */
 168         struct rqelement *urqe = &rqg->rqe[rqg->badsdno];   /* rqe of the bad subdisk */
 169
 170         /* XOR destination is the user data */
 171         sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* old data contents */
 172         data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */
 173         length = urqe->grouplen * (DEV_BSIZE / sizeof(int)); /* and number of ints */
 174
 175         for (count = 0; count < length; count++)
 176             data[count] ^= sdata[count];
 177
 178         /*
 179          * In a normal read, we will normally read directly
 180          * into the user buffer.  This doesn't work if
 181          * we're also doing a recovery, so we have to
 182          * copy it
 183          */
 184         if (rqe->flags & XFR_NORMAL_READ) {                 /* normal read as well, */
 185             char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */
 186             char *dst;
 187
 188             dst = (char *) ubio->bio_buf->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */
 189             length = rqe->datalen << DEV_BSHIFT;            /* and count involved */
 190             bcopy(src, dst, length);                        /* move it */
 191         }
 192     } else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID 4/5 group write operation  */
 193     &&(rqg->active == 1))                                   /* and this is the last active request */
 194         complete_raid5_write(rqe);
 195     /*
 196      * This is the earliest place where we can be
 197      * sure that the request has really finished,
 198      * since complete_raid5_write can issue new
 199      * requests.
 200      */
 201     rqg->active--;                                          /* this request now finished */
 202     if (rqg->active == 0) {                                 /* request group finished, */
 203         rq->active--;                                       /* one less */
 204         if (rqg->lock) {                                    /* got a lock? */
 205             unlockrange(rqg->plexno, rqg->lock);            /* yes, free it */
 206             rqg->lock = 0;
 207         }
 208     }
 209     if (rq->active == 0) {                                  /* request finished, */
 210 #ifdef VINUMDEBUG
 211         if (debug & DEBUG_RESID) {
 212             if (ubio->bio_buf->b_resid != 0)                        /* still something to transfer? */
 213                 Debugger("resid");
 214         }
 215 #endif
 216
 217         if (rq->error) {                                    /* did we have an error? */
 218             if (rq->isplex) {                               /* plex operation, */
 219                 ubio->bio_buf->b_flags |= B_ERROR;          /* yes, propagate to user */
 220                 ubio->bio_buf->b_error = rq->error;
 221             } else                                          /* try to recover */
 222                 queue_daemon_request(daemonrq_ioerror, (union daemoninfo) rq); /* let the daemon complete */
 223         } else {
 224             ubio->bio_buf->b_resid = 0;                     /* completed our transfer */
 225             if (rq->isplex == 0)                            /* volume request, */
 226                 VOL[rq->volplex.volno].active--;            /* another request finished */
 227             biodone(ubio);                                  /* top level buffer completed */
 228             freerq(rq);                                     /* return the request storage */
 229         }
 230     }
 231 }
 232
 233 /* Free a request block and anything hanging off it */
 234 void
 235 freerq(struct request *rq)
 236 {
 237     struct rqgroup *rqg;
 238     struct rqgroup *nrqg;                                   /* next in chain */
 239     int rqno;
 240
 241     for (rqg = rq->rqg; rqg != NULL; rqg = nrqg) {          /* through the whole request chain */
 242         if (rqg->lock)                                      /* got a lock? */
 243             unlockrange(rqg->plexno, rqg->lock);            /* yes, free it */
 244         for (rqno = 0; rqno < rqg->count; rqno++) {
 245             if ((rqg->rqe[rqno].flags & XFR_MALLOCED)       /* data buffer was malloced, */
 246             &&rqg->rqe[rqno].b.b_data)                      /* and the allocation succeeded */
 247                 Free(rqg->rqe[rqno].b.b_data);              /* free it */
 248             if (rqg->rqe[rqno].flags & XFR_BUFLOCKED) {     /* locked this buffer, */
 249                 BUF_UNLOCK(&rqg->rqe[rqno].b);              /* unlock it again */
 250                 BUF_LOCKFREE(&rqg->rqe[rqno].b);
 251             }
 252         }
 253         nrqg = rqg->next;                                   /* note the next one */
 254         Free(rqg);                                          /* and free this one */
 255     }
 256     Free(rq);                                               /* free the request itself */
 257 }
 258
 259 /* I/O on subdisk completed */
 260 void
 261 sdio_done(struct bio *bio)
 262 {
 263     struct sdbuf *sbp;
 264
 265     sbp = (struct sdbuf *) bio->bio_buf;
 266     if (sbp->b.b_flags & B_ERROR) {                         /* had an error */
 267         sbp->bio->bio_buf->b_flags |= B_ERROR;                      /* propagate upwards */
 268         sbp->bio->bio_buf->b_error = sbp->b.b_error;
 269     }
 270 #ifdef VINUMDEBUG
 271     if (debug & DEBUG_LASTREQS)
 272         logrq(loginfo_sdiodone, (union rqinfou)bio, bio);
 273 #endif
 274     sbp->bio->bio_buf->b_resid = sbp->b.b_resid;                            /* copy the resid field */
 275     /* Now update the statistics */
 276     if (sbp->b.b_cmd == BUF_CMD_READ) {                     /* read operation */
 277         DRIVE[sbp->driveno].reads++;
 278         DRIVE[sbp->driveno].bytes_read += sbp->b.b_bcount;
 279         SD[sbp->sdno].reads++;
 280         SD[sbp->sdno].bytes_read += sbp->b.b_bcount;
 281     } else {                                                /* write operation */
 282         DRIVE[sbp->driveno].writes++;
 283         DRIVE[sbp->driveno].bytes_written += sbp->b.b_bcount;
 284         SD[sbp->sdno].writes++;
 285         SD[sbp->sdno].bytes_written += sbp->b.b_bcount;
 286     }
 287     biodone_sync(bio);
 288     biodone(sbp->bio);                                      /* complete the caller's I/O */
 289     BUF_UNLOCK(&sbp->b);
 290     BUF_LOCKFREE(&sbp->b);
 291     Free(sbp);
 292 }
 293
 294 /* Start the second phase of a RAID-4 or RAID-5 group write operation. */
 295 void
 296 complete_raid5_write(struct rqelement *rqe)
 297 {
 298     int *sdata;                                             /* source */
 299     int *pdata;                                             /* and parity block data */
 300     int length;                                             /* and count involved */
 301     int count;                                              /* loop counter */
 302     int rqno;                                               /* request index */
 303     int rqoffset;                                           /* offset of request data from parity data */
 304     struct bio *ubio;                                       /* user buffer header */
 305     struct request *rq;                                     /* pointer to our request */
 306     struct rqgroup *rqg;                                    /* and to the request group */
 307     struct rqelement *prqe;                                 /* point to the parity block */
 308     struct drive *drive;                                    /* drive to access */
 309     rqg = rqe->rqg;                                         /* and to our request group */
 310     rq = rqg->rq;                                           /* point to our request */
 311     ubio = rq->bio;                                         /* user's buffer header */
 312     prqe = &rqg->rqe[0];                                    /* point to the parity block */
 313
 314     /*
 315      * If we get to this function, we have normal or
 316      * degraded writes, or a combination of both.  We do
 317      * the same thing in each case: we perform an
 318      * exclusive or to the parity block.  The only
 319      * difference is the origin of the data and the
 320      * address range.
 321      */
 322     if (rqe->flags & XFR_DEGRADED_WRITE) {                  /* do the degraded write stuff */
 323         pdata = (int *) (&prqe->b.b_data[(prqe->groupoffset) << DEV_BSHIFT]); /* parity data pointer */
 324         bzero(pdata, prqe->grouplen << DEV_BSHIFT);         /* start with nothing in the parity block */
 325
 326         /* Now get what data we need from each block */
 327         for (rqno = 1; rqno < rqg->count; rqno++) {         /* for all the data blocks */
 328             rqe = &rqg->rqe[rqno];                          /* this request */
 329             sdata = (int *) (&rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]); /* old data */
 330             length = rqe->grouplen << (DEV_BSHIFT - 2);     /* and count involved */
 331
 332             /*
 333              * Add the data block to the parity block.  Before
 334              * we started the request, we zeroed the parity
 335              * block, so the result of adding all the other
 336              * blocks and the block we want to write will be
 337              * the correct parity block.
 338              */
 339             for (count = 0; count < length; count++)
 340                 pdata[count] ^= sdata[count];
 341             if ((rqe->flags & XFR_MALLOCED)                 /* the buffer was malloced, */
 342             &&((rqg->flags & XFR_NORMAL_WRITE) == 0)) {     /* and we have no normal write, */
 343                 Free(rqe->b.b_data);                        /* free it now */
 344                 rqe->flags &= ~XFR_MALLOCED;
 345             }
 346         }
 347     }
 348     if (rqg->flags & XFR_NORMAL_WRITE) {                    /* do normal write stuff */
 349         /* Get what data we need from each block */
 350         for (rqno = 1; rqno < rqg->count; rqno++) {         /* for all the data blocks */
 351             rqe = &rqg->rqe[rqno];                          /* this request */
 352             if ((rqe->flags & (XFR_DATA_BLOCK | XFR_BAD_SUBDISK | XFR_NORMAL_WRITE))
 353                 == (XFR_DATA_BLOCK | XFR_NORMAL_WRITE)) {   /* good data block to write */
 354                 sdata = (int *) &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* old data contents */
 355                 rqoffset = rqe->dataoffset + rqe->sdoffset - prqe->sdoffset; /* corresponding parity block offset */
 356                 pdata = (int *) (&prqe->b.b_data[rqoffset << DEV_BSHIFT]); /* parity data pointer */
 357                 length = rqe->datalen * (DEV_BSIZE / sizeof(int)); /* and number of ints */
 358
 359                 /*
 360                  * "remove" the old data block
 361                  * from the parity block
 362                  */
 363                 if ((pdata < ((int *) prqe->b.b_data))
 364                     || (&pdata[length] > ((int *) (prqe->b.b_data + prqe->b.b_bcount)))
 365                     || (sdata < ((int *) rqe->b.b_data))
 366                     || (&sdata[length] > ((int *) (rqe->b.b_data + rqe->b.b_bcount))))
 367                     panic("complete_raid5_write: bounds overflow");
 368                 for (count = 0; count < length; count++)
 369                     pdata[count] ^= sdata[count];
 370
 371                 /* "add" the new data block */
 372                 sdata = (int *) (&ubio->bio_buf->b_data[rqe->useroffset << DEV_BSHIFT]); /* new data */
 373                 if ((sdata < ((int *) ubio->bio_buf->b_data))
 374                     || (&sdata[length] > ((int *) (ubio->bio_buf->b_data + ubio->bio_buf->b_bcount))))
 375                     panic("complete_raid5_write: bounds overflow");
 376                 for (count = 0; count < length; count++)
 377                     pdata[count] ^= sdata[count];
 378
 379                 /* Free the malloced buffer */
 380                 if (rqe->flags & XFR_MALLOCED) {            /* the buffer was malloced, */
 381                     Free(rqe->b.b_data);                    /* free it */
 382                     rqe->flags &= ~XFR_MALLOCED;
 383                 } else
 384                     panic("complete_raid5_write: malloc conflict");
 385
 386                 if ((rqe->b.b_cmd == BUF_CMD_READ)          /* this was a read */
 387                 &&((rqe->flags & XFR_BAD_SUBDISK) == 0)) {  /* and we can write this block */
 388                     rqe->b.b_cmd = BUF_CMD_WRITE;   /* we're writing now */
 389                     rqe->b.b_bio1.bio_done = complete_rqe;          /* by calling us here */
 390                     rqe->flags &= ~XFR_PARITYOP;            /* reset flags that brought us here */
 391                     rqe->b.b_data = &ubio->bio_buf->b_data[rqe->useroffset << DEV_BSHIFT]; /* point to the user data */
 392                     rqe->b.b_bcount = rqe->datalen << DEV_BSHIFT; /* length to write */
 393                     rqe->b.b_resid = rqe->b.b_bcount;       /* nothing transferred */
 394                     rqe->b.b_bio1.bio_offset += (off_t)rqe->dataoffset << DEV_BSHIFT;       /* point to the correct block */
 395                     drive = &DRIVE[rqe->driveno];           /* drive to access */
 396                     rqe->b.b_bio1.bio_driver_info = drive->dev;
 397                     rqg->active++;                          /* another active request */
 398
 399                                                             /* We can't sleep here, so we just increment the counters. */
 400                     drive->active++;
 401                     if (drive->active >= drive->maxactive)
 402                         drive->maxactive = drive->active;
 403                     vinum_conf.active++;
 404                     if (vinum_conf.active >= vinum_conf.maxactive)
 405                         vinum_conf.maxactive = vinum_conf.active;
 406 #if VINUMDEBUG
 407                     if (debug & DEBUG_ADDRESSES)
 408                         log(LOG_DEBUG,
 409                             "  %s dev %s, sd %d, offset 0x%llx, devoffset 0x%llx, length %d\n",
 410                             (rqe->b.b_cmd == BUF_CMD_READ) ? "Read" : "Write",
 411                             drive->devicename,
 412                             rqe->sdno,
 413                             rqe->b.b_bio1.bio_offset - ((off_t)SD[rqe->sdno].driveoffset << DEV_BSHIFT),
 414                             rqe->b.b_bio1.bio_offset,
 415                             rqe->b.b_bcount);
 416                     if (debug & DEBUG_LASTREQS)
 417                         logrq(loginfo_raid5_data, (union rqinfou) rqe, ubio);
 418 #endif
 419                     vn_strategy(drive->vp, &rqe->b.b_bio1);
 420                 }
 421             }
 422         }
 423     }
 424     /* Finally, write the parity block */
 425     rqe = &rqg->rqe[0];
 426     rqe->b.b_cmd = BUF_CMD_WRITE;                   /* we're writing now */
 427     rqe->b.b_bio1.bio_done = complete_rqe;                          /* by calling us here */
 428     rqg->flags &= ~XFR_PARITYOP;                            /* reset flags that brought us here */
 429     rqe->b.b_bcount = rqe->buflen << DEV_BSHIFT;            /* length to write */
 430     rqe->b.b_resid = rqe->b.b_bcount;                       /* nothing transferred */
 431     drive = &DRIVE[rqe->driveno];                           /* drive to access */
 432     rqe->b.b_bio1.bio_driver_info = drive->dev;
 433     rqg->active++;                                          /* another active request */
 434
 435     /* We can't sleep here, so we just increment the counters. */
 436     drive->active++;
 437     if (drive->active >= drive->maxactive)
 438         drive->maxactive = drive->active;
 439     vinum_conf.active++;
 440     if (vinum_conf.active >= vinum_conf.maxactive)
 441         vinum_conf.maxactive = vinum_conf.active;
 442
 443 #if VINUMDEBUG
 444     if (debug & DEBUG_ADDRESSES)
 445         log(LOG_DEBUG,
 446             "  %s dev %s, sd %d, offset 0x%llx, devoffset 0x%llx, length %d\n",
 447             (rqe->b.b_cmd == BUF_CMD_READ) ? "Read" : "Write",
 448             drive->devicename,
 449             rqe->sdno,
 450             rqe->b.b_bio1.bio_offset - ((off_t)SD[rqe->sdno].driveoffset << DEV_BSHIFT),
 451             rqe->b.b_bio1.bio_offset,
 452             rqe->b.b_bcount);
 453     if (debug & DEBUG_LASTREQS)
 454         logrq(loginfo_raid5_parity, (union rqinfou) rqe, ubio);
 455 #endif
 456     vn_strategy(drive->vp, &rqe->b.b_bio1);
 457 }
 458
 459 /* Local Variables: */
 460 /* fill-column: 50 */
 461 /* End: */