usr/src/uts/common/os/bio.c
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2011 Joyent, Inc. All rights reserved.
28 * Copyright (c) 2016 by Delphix. All rights reserved.
31 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
32 /* All Rights Reserved */
35 * University Copyright- Copyright (c) 1982, 1986, 1988
36 * The Regents of the University of California
37 * All Rights Reserved
39 * University Acknowledgment- Portions of this document are derived from
40 * software developed by the University of California, Berkeley, and its
41 * contributors.
44 #include <sys/types.h>
45 #include <sys/t_lock.h>
46 #include <sys/sysmacros.h>
47 #include <sys/conf.h>
48 #include <sys/cpuvar.h>
49 #include <sys/errno.h>
50 #include <sys/debug.h>
51 #include <sys/buf.h>
52 #include <sys/var.h>
53 #include <sys/vnode.h>
54 #include <sys/bitmap.h>
55 #include <sys/cmn_err.h>
56 #include <sys/kmem.h>
57 #include <sys/vmem.h>
58 #include <sys/atomic.h>
59 #include <vm/seg_kmem.h>
60 #include <vm/page.h>
61 #include <vm/pvn.h>
62 #include <sys/vtrace.h>
63 #include <sys/tnf_probe.h>
64 #include <sys/fs/ufs_inode.h>
65 #include <sys/fs/ufs_bio.h>
66 #include <sys/fs/ufs_log.h>
67 #include <sys/systm.h>
68 #include <sys/vfs.h>
69 #include <sys/sdt.h>
71 /* Locks */
72 static kmutex_t blist_lock; /* protects b_list */
73 static kmutex_t bhdr_lock; /* protects the bhdrlist */
74 static kmutex_t bfree_lock; /* protects the bfreelist structure */
76 struct hbuf *hbuf; /* Hash buckets */
77 struct dwbuf *dwbuf; /* Delayed write buckets */
78 static struct buf *bhdrlist; /* buf header free list */
79 static int nbuf; /* number of buffer headers allocated */
81 static int lastindex; /* Reference point on where to start */
82 /* when looking for free buffers */
84 #define bio_bhash(dev, bn) (hash2ints((dev), (int)(bn)) & v.v_hmask)
85 #define EMPTY_LIST ((struct buf *)-1)
87 static kcondvar_t bio_mem_cv; /* Condition variables */
88 static kcondvar_t bio_flushinval_cv;
89 static int bio_doingflush; /* flush in progress */
90 static int bio_doinginval; /* inval in progress */
91 static int bio_flinv_cv_wanted; /* someone waiting for cv */
94 * Statistics on the buffer cache
96 struct biostats biostats = {
97 { "buffer_cache_lookups", KSTAT_DATA_UINT32 },
98 { "buffer_cache_hits", KSTAT_DATA_UINT32 },
99 { "new_buffer_requests", KSTAT_DATA_UINT32 },
100 { "waits_for_buffer_allocs", KSTAT_DATA_UINT32 },
101 { "buffers_locked_by_someone", KSTAT_DATA_UINT32 },
102 { "duplicate_buffers_found", KSTAT_DATA_UINT32 }
106 * kstat data
108 kstat_named_t *biostats_ptr = (kstat_named_t *)&biostats;
109 uint_t biostats_ndata = (uint_t)(sizeof (biostats) /
110 sizeof (kstat_named_t));
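/*
 * biostats_ptr and biostats_ndata are meant to be handed to the kstat
 * framework as a "virtual" named kstat whose ks_data points directly at
 * the biostats structure above.  A minimal sketch of that wiring (the
 * module and class names here are illustrative, not a claim about where
 * the kstat is actually created):
 *
 *	kstat_t *ksp;
 *
 *	ksp = kstat_create("unix", 0, "biostats", "misc",
 *	    KSTAT_TYPE_NAMED, biostats_ndata, KSTAT_FLAG_VIRTUAL);
 *	if (ksp != NULL) {
 *		ksp->ks_data = (void *)biostats_ptr;
 *		kstat_install(ksp);
 *	}
 */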
113 * Statistics on ufs buffer cache
114 * Not protected by locks
116 struct ufsbiostats ub = {
117 { "breads", KSTAT_DATA_UINT32 },
118 { "bwrites", KSTAT_DATA_UINT32 },
119 { "fbiwrites", KSTAT_DATA_UINT32 },
120 { "getpages", KSTAT_DATA_UINT32 },
121 { "getras", KSTAT_DATA_UINT32 },
122 { "putsyncs", KSTAT_DATA_UINT32 },
123 { "putasyncs", KSTAT_DATA_UINT32 },
124 { "putpageios", KSTAT_DATA_UINT32 },
128 * more UFS Logging eccentricities...
130 * These function pointers are required because "#pragma weak ..." doesn't
131 * work in reverse order: genunix (bio.c) is loaded before the ufs modules,
132 * so pointers to ufs routines can't be plugged into bio.c calls at that
133 * point. Instead they are initialized when the "lufsops" table is set up
134 * in "lufs.c:_init()".
136 void (*bio_lufs_strategy)(void *, buf_t *);
137 void (*bio_snapshot_strategy)(void *, buf_t *);
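/*
 * A minimal sketch of how the logging module is expected to plug these
 * pointers in from lufs.c:_init() (the ufs-side function names shown
 * here are assumptions for illustration only):
 *
 *	extern void lufs_strategy(void *, buf_t *);
 *	extern void fssnap_strategy(void *, buf_t *);
 *
 *	bio_lufs_strategy = lufs_strategy;
 *	bio_snapshot_strategy = fssnap_strategy;
 */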
140 /* Private routines */
141 static struct buf *bio_getfreeblk(long);
142 static void bio_mem_get(long);
143 static void bio_bhdr_free(struct buf *);
144 static struct buf *bio_bhdr_alloc(void);
145 static void bio_recycle(int, long);
146 static void bio_pageio_done(struct buf *);
147 static int bio_incore(dev_t, daddr_t);
150 * Buffer cache constants
152 #define BIO_BUF_PERCENT (100/2) /* default: 2% of memory */
153 #define BIO_MAX_PERCENT (100/20) /* max is 20% of real memory */
154 #define BIO_BHDR_POOL 100 /* Default bhdr pool size */
155 #define BIO_MIN_HDR 10 /* Minimum number of buffer headers */
156 #define BIO_MIN_HWM (BIO_MIN_HDR * MAXBSIZE / 1024)
157 #define BIO_HASHLEN 4 /* Target length of hash chains */
160 /* Flags for bio_recycle() */
161 #define BIO_HEADER 0x01
162 #define BIO_MEM 0x02
164 extern int bufhwm; /* User tunable - high water mark for mem */
165 extern int bufhwm_pct; /* ditto - given in % of physmem */
168 * The following routines allocate and free
169 * buffers with various side effects. In general the
170 * arguments to an allocate routine are a device and
171 * a block number, and the value is a pointer to
172 * the buffer header; the buffer returned is locked with a
173 * binary semaphore so that no one else can touch it. If the block was
174 * already in core, no I/O need be done; if it is
175 * already locked, the process waits until it becomes free.
176 * The following routines allocate a buffer:
177 * getblk
178 * bread/BREAD
179 * breada
180 * Eventually the buffer must be released, possibly with the
181 * side effect of writing it out, by using one of
182 * bwrite/BWRITE/brwrite
183 * bdwrite/bdrwrite
184 * bawrite
185 * brelse
187 * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
188 * Instead, a binary semaphore, b_sem is used to gain exclusive access to
189 * a buffer and a binary semaphore, b_io is used for I/O synchronization.
190 * B_DONE is still used to denote a buffer with I/O complete on it.
192 * The bfreelist.b_bcount field is computed every time fsflush runs. It
193 * is only an approximation and should not be used where a very accurate
194 * count of the free buffers is needed.
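/*
 * A minimal usage sketch of the interfaces described above, with error
 * handling trimmed ('dev', 'blkno' and 'bsize' are caller supplied):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	bp = bread(dev, blkno, bsize);	 (read; returns the buffer locked)
 *	if ((error = geterror(bp)) != 0) {
 *		brelse(bp);		 (release, no I/O implied)
 *		return (error);
 *	}
 *	... modify bp->b_un.b_addr ...
 *	bdwrite(bp);			 (mark delayed-write and release)
 *
 * or, to write it out immediately and release it, use bwrite(bp) instead
 * of bdwrite(bp).
 */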
198 * Read in (if necessary) the block and return a buffer pointer.
200 * This interface is provided for binary compatibility. Using
201 * BREAD() directly avoids the extra function call overhead invoked
202 * by calling this routine.
204 struct buf *
205 bread(dev_t dev, daddr_t blkno, long bsize)
207 return (BREAD(dev, blkno, bsize));
211 * Common code for reading a buffer with various options
213 * Read in (if necessary) the block and return a buffer pointer.
215 struct buf *
216 bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
218 struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
219 struct buf *bp;
220 klwp_t *lwp = ttolwp(curthread);
222 CPU_STATS_ADD_K(sys, lread, 1);
223 bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
224 if (bp->b_flags & B_DONE)
225 return (bp);
226 bp->b_flags |= B_READ;
227 ASSERT(bp->b_bcount == bsize);
228 if (ufsvfsp == NULL) { /* !ufs */
229 (void) bdev_strategy(bp);
230 } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
231 /* ufs && logging */
232 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
233 } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
234 /* ufs && snapshots */
235 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
236 } else {
237 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
238 ub.ub_breads.value.ul++; /* ufs && !logging */
239 (void) bdev_strategy(bp);
241 if (lwp != NULL)
242 lwp->lwp_ru.inblock++;
243 CPU_STATS_ADD_K(sys, bread, 1);
244 (void) biowait(bp);
245 return (bp);
249 * Read in the block, like bread, but also start I/O on the
250 * read-ahead block (which is not allocated to the caller).
252 struct buf *
253 breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
255 struct buf *bp, *rabp;
256 klwp_t *lwp = ttolwp(curthread);
258 bp = NULL;
259 if (!bio_incore(dev, blkno)) {
260 CPU_STATS_ADD_K(sys, lread, 1);
261 bp = GETBLK(dev, blkno, bsize);
262 if ((bp->b_flags & B_DONE) == 0) {
263 bp->b_flags |= B_READ;
264 bp->b_bcount = bsize;
265 (void) bdev_strategy(bp);
266 if (lwp != NULL)
267 lwp->lwp_ru.inblock++;
268 CPU_STATS_ADD_K(sys, bread, 1);
271 if (rablkno && bfreelist.b_bcount > 1 &&
272 !bio_incore(dev, rablkno)) {
273 rabp = GETBLK(dev, rablkno, bsize);
274 if (rabp->b_flags & B_DONE)
275 brelse(rabp);
276 else {
277 rabp->b_flags |= B_READ|B_ASYNC;
278 rabp->b_bcount = bsize;
279 (void) bdev_strategy(rabp);
280 if (lwp != NULL)
281 lwp->lwp_ru.inblock++;
282 CPU_STATS_ADD_K(sys, bread, 1);
285 if (bp == NULL)
286 return (BREAD(dev, blkno, bsize));
287 (void) biowait(bp);
288 return (bp);
292 * Common code for writing a buffer with various options.
294 * force_wait - wait for write completion regardless of B_ASYNC flag
295 * do_relse - release the buffer when we are done
296 * clear_flags - flags to clear from the buffer
298 void
299 bwrite_common(void *arg, struct buf *bp, int force_wait,
300 int do_relse, int clear_flags)
302 register int do_wait;
303 struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
304 int flag;
305 klwp_t *lwp = ttolwp(curthread);
306 struct cpu *cpup;
308 ASSERT(SEMA_HELD(&bp->b_sem));
309 flag = bp->b_flags;
310 bp->b_flags &= ~clear_flags;
311 if (lwp != NULL)
312 lwp->lwp_ru.oublock++;
313 CPU_STATS_ENTER_K();
314 cpup = CPU; /* get pointer AFTER preemption is disabled */
315 CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
316 CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
317 do_wait = ((flag & B_ASYNC) == 0 || force_wait);
318 if (do_wait == 0)
319 CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
320 CPU_STATS_EXIT_K();
321 if (ufsvfsp == NULL) {
322 (void) bdev_strategy(bp);
323 } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
324 /* ufs && logging */
325 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
326 } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
327 /* ufs && snapshots */
328 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
329 } else {
330 ub.ub_bwrites.value.ul++; /* ufs && !logging */
331 (void) bdev_strategy(bp);
333 if (do_wait) {
334 (void) biowait(bp);
335 if (do_relse) {
336 brelse(bp);
342 * Write the buffer, waiting for completion (unless B_ASYNC is set).
343 * Then release the buffer.
344 * This interface is provided for binary compatibility. Using
345 * BWRITE() directly avoids the extra function call overhead invoked
346 * by calling this routine.
348 void
349 bwrite(struct buf *bp)
351 BWRITE(bp);
355 * Write the buffer, waiting for completion.
356 * But don't release the buffer afterwards.
357 * This interface is provided for binary compatibility. Using
358 * BWRITE2() directly avoids the extra function call overhead.
360 void
361 bwrite2(struct buf *bp)
363 BWRITE2(bp);
367 * Release the buffer, marking it so that if it is grabbed
368 * for another purpose it will be written out before being
369 * given up (e.g. when writing a partial block where it is
370 * assumed that another write for the same block will soon follow).
371 * Also save the time that the block is first marked as delayed
372 * so that it will be written in a reasonable time.
374 void
375 bdwrite(struct buf *bp)
377 ASSERT(SEMA_HELD(&bp->b_sem));
378 CPU_STATS_ADD_K(sys, lwrite, 1);
379 if ((bp->b_flags & B_DELWRI) == 0)
380 bp->b_start = ddi_get_lbolt();
382 * B_DONE allows others to use the buffer, B_DELWRI causes the
383 * buffer to be written before being reused, and setting b_resid
384 * to zero says the buffer is complete.
386 bp->b_flags |= B_DELWRI | B_DONE;
387 bp->b_resid = 0;
388 brelse(bp);
392 * Release the buffer, start I/O on it, but don't wait for completion.
394 void
395 bawrite(struct buf *bp)
397 ASSERT(SEMA_HELD(&bp->b_sem));
399 /* Use bfreelist.b_bcount as a weird-ass heuristic */
400 if (bfreelist.b_bcount > 4)
401 bp->b_flags |= B_ASYNC;
402 BWRITE(bp);
406 * Release the buffer, with no I/O implied.
408 void
409 brelse(struct buf *bp)
411 struct buf **backp;
412 uint_t index;
413 kmutex_t *hmp;
414 struct buf *dp;
415 struct hbuf *hp;
418 ASSERT(SEMA_HELD(&bp->b_sem));
421 * Clear the retry write flag if the buffer was written without
422 * error. The presence of B_DELWRI means the buffer has not yet
423 * been written and the presence of B_ERROR means that an error
424 * is still occurring.
426 if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
427 bp->b_flags &= ~B_RETRYWRI;
430 /* Check for anomalous conditions */
431 if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
432 if (bp->b_flags & B_NOCACHE) {
433 /* Don't add to the freelist. Destroy it now */
434 kmem_free(bp->b_un.b_addr, bp->b_bufsize);
435 sema_destroy(&bp->b_sem);
436 sema_destroy(&bp->b_io);
437 kmem_free(bp, sizeof (struct buf));
438 return;
441 * If a write failed and we are supposed to retry write,
442 * don't toss the buffer. Keep it around and mark it
443 * delayed write in the hopes that it will eventually
444 * get flushed (and still keep the system running.)
446 if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
447 bp->b_flags |= B_DELWRI;
448 /* keep fsflush from trying continuously to flush */
449 bp->b_start = ddi_get_lbolt();
450 } else
451 bp->b_flags |= B_AGE|B_STALE;
452 bp->b_flags &= ~B_ERROR;
453 bp->b_error = 0;
457 * If delayed write is set then put in on the delayed
458 * write list instead of the free buffer list.
460 index = bio_bhash(bp->b_edev, bp->b_blkno);
461 hmp = &hbuf[index].b_lock;
463 mutex_enter(hmp);
464 hp = &hbuf[index];
465 dp = (struct buf *)hp;
468 * Make sure that the number of entries on this list is sane:
469 * 0 <= count < total # buffers
471 ASSERT(hp->b_length >= 0);
472 ASSERT(hp->b_length < nbuf);
474 hp->b_length++; /* We are adding this buffer */
476 if (bp->b_flags & B_DELWRI) {
478 * This buffer goes on the delayed write buffer list
480 dp = (struct buf *)&dwbuf[index];
482 ASSERT(bp->b_bufsize > 0);
483 ASSERT(bp->b_bcount > 0);
484 ASSERT(bp->b_un.b_addr != NULL);
486 if (bp->b_flags & B_AGE) {
487 backp = &dp->av_forw;
488 (*backp)->av_back = bp;
489 bp->av_forw = *backp;
490 *backp = bp;
491 bp->av_back = dp;
492 } else {
493 backp = &dp->av_back;
494 (*backp)->av_forw = bp;
495 bp->av_back = *backp;
496 *backp = bp;
497 bp->av_forw = dp;
499 mutex_exit(hmp);
501 if (bfreelist.b_flags & B_WANTED) {
503 * Should come here very very rarely.
505 mutex_enter(&bfree_lock);
506 if (bfreelist.b_flags & B_WANTED) {
507 bfreelist.b_flags &= ~B_WANTED;
508 cv_broadcast(&bio_mem_cv);
510 mutex_exit(&bfree_lock);
513 bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
515 * Don't let anyone get the buffer off the freelist before we
516 * release our hold on it.
518 sema_v(&bp->b_sem);
522 * Return a count of the number of B_BUSY buffers in the system
523 * Can only be used as a good estimate. If 'cleanit' is set,
524 * try to flush all bufs.
527 bio_busy(int cleanit)
529 struct buf *bp, *dp;
530 int busy = 0;
531 int i;
532 kmutex_t *hmp;
534 for (i = 0; i < v.v_hbuf; i++) {
535 dp = (struct buf *)&hbuf[i];
536 hmp = &hbuf[i].b_lock;
538 mutex_enter(hmp);
539 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
540 if (bp->b_flags & B_BUSY)
541 busy++;
543 mutex_exit(hmp);
546 if (cleanit && busy != 0) {
547 bflush(NODEV);
550 return (busy);
554 * this interface is provided for binary compatibility.
556 * Assign a buffer for the given block. If the appropriate
557 * block is already associated, return it; otherwise search
558 * for the oldest non-busy buffer and reassign it.
560 struct buf *
561 getblk(dev_t dev, daddr_t blkno, long bsize)
563 return (getblk_common(/* ufsvfsp */ NULL, dev,
564 blkno, bsize, /* errflg */ 0));
568 * Assign a buffer for the given block. If the appropriate
569 * block is already associated, return it; otherwise search
570 * for the oldest non-busy buffer and reassign it.
572 struct buf *
573 getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
575 ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
576 struct buf *bp;
577 struct buf *dp;
578 struct buf *nbp = NULL;
579 struct buf *errbp;
580 uint_t index;
581 kmutex_t *hmp;
582 struct hbuf *hp;
584 if (getmajor(dev) >= devcnt)
585 cmn_err(CE_PANIC, "blkdev");
587 biostats.bio_lookup.value.ui32++;
589 index = bio_bhash(dev, blkno);
590 hp = &hbuf[index];
591 dp = (struct buf *)hp;
592 hmp = &hp->b_lock;
594 mutex_enter(hmp);
595 loop:
596 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
597 if (bp->b_blkno != blkno || bp->b_edev != dev ||
598 (bp->b_flags & B_STALE))
599 continue;
601 * Avoid holding the hash lock in the event that
602 * the buffer is locked by someone. Since the hash chain
603 * may change when we drop the hash lock
604 * we have to start at the beginning of the chain if the
605 * buffer identity/contents aren't valid.
607 if (!sema_tryp(&bp->b_sem)) {
608 biostats.bio_bufbusy.value.ui32++;
609 mutex_exit(hmp);
611 * OK, we are dealing with a busy buffer.
612 * In the case that we are panicking and we
613 * got called from bread(), we have some chance
614 * for error recovery. So better bail out from
615 * here since sema_p() won't block. If we got
616 * called directly from ufs routines, there is
617 * no way to report an error yet.
619 if (panicstr && errflg)
620 goto errout;
622 * For the following line of code to work
623 * correctly never kmem_free the buffer "header".
625 sema_p(&bp->b_sem);
626 if (bp->b_blkno != blkno || bp->b_edev != dev ||
627 (bp->b_flags & B_STALE)) {
628 sema_v(&bp->b_sem);
629 mutex_enter(hmp);
630 goto loop; /* start over */
632 mutex_enter(hmp);
634 /* Found */
635 biostats.bio_hit.value.ui32++;
636 bp->b_flags &= ~B_AGE;
639 * Yank it off the free/delayed write lists
641 hp->b_length--;
642 notavail(bp);
643 mutex_exit(hmp);
645 ASSERT((bp->b_flags & B_NOCACHE) == 0);
647 if (nbp == NULL) {
649 * Make the common path short.
651 ASSERT(SEMA_HELD(&bp->b_sem));
652 return (bp);
655 biostats.bio_bufdup.value.ui32++;
658 * The buffer must have entered the hash while we dropped the
659 * hash lock, so free the new buffer we allocated and return
660 * the found buffer.
662 kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
663 nbp->b_un.b_addr = NULL;
666 * Account for the memory
668 mutex_enter(&bfree_lock);
669 bfreelist.b_bufsize += nbp->b_bufsize;
670 mutex_exit(&bfree_lock);
673 * Destroy buf identity, and place on avail list
675 nbp->b_dev = (o_dev_t)NODEV;
676 nbp->b_edev = NODEV;
677 nbp->b_flags = 0;
678 nbp->b_file = NULL;
679 nbp->b_offset = -1;
681 sema_v(&nbp->b_sem);
682 bio_bhdr_free(nbp);
684 ASSERT(SEMA_HELD(&bp->b_sem));
685 return (bp);
689 * bio_getfreeblk may block so check the hash chain again.
691 if (nbp == NULL) {
692 mutex_exit(hmp);
693 nbp = bio_getfreeblk(bsize);
694 mutex_enter(hmp);
695 goto loop;
699 * New buffer. Assign nbp and stick it on the hash.
701 nbp->b_flags = B_BUSY;
702 nbp->b_edev = dev;
703 nbp->b_dev = (o_dev_t)cmpdev(dev);
704 nbp->b_blkno = blkno;
705 nbp->b_iodone = NULL;
706 nbp->b_bcount = bsize;
708 * If we are given a ufsvfsp and the vfs_root field is NULL
709 * then this must be I/O for a superblock. A superblock's
710 * buffer is set up in mountfs() and there is no root vnode
711 * at that point.
713 if (ufsvfsp && ufsvfsp->vfs_root) {
714 nbp->b_vp = ufsvfsp->vfs_root;
715 } else {
716 nbp->b_vp = NULL;
719 ASSERT((nbp->b_flags & B_NOCACHE) == 0);
721 binshash(nbp, dp);
722 mutex_exit(hmp);
724 ASSERT(SEMA_HELD(&nbp->b_sem));
726 return (nbp);
730 * Come here in case of an internal error. At this point we couldn't
731 * get a buffer, but we have to return one. Hence we allocate some
732 * kind of error reply buffer on the fly. This buffer is marked as
733 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
734 * - B_ERROR will indicate error to the caller.
735 * - B_DONE will prevent us from reading the buffer from
736 * the device.
737 * - B_NOCACHE will cause this buffer to be freed in
738 * brelse().
741 errout:
742 errbp = geteblk();
743 sema_p(&errbp->b_sem);
744 errbp->b_flags &= ~B_BUSY;
745 errbp->b_flags |= (B_ERROR | B_DONE);
746 return (errbp);
750 * Get an empty block, not assigned to any particular device.
751 * Returns a locked buffer that is not on any hash or free list.
753 struct buf *
754 ngeteblk(long bsize)
756 struct buf *bp;
758 bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
759 bioinit(bp);
760 bp->av_forw = bp->av_back = NULL;
761 bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
762 bp->b_bufsize = bsize;
763 bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
764 bp->b_dev = (o_dev_t)NODEV;
765 bp->b_edev = NODEV;
766 bp->b_lblkno = 0;
767 bp->b_bcount = bsize;
768 bp->b_iodone = NULL;
769 return (bp);
773 * The interface of geteblk() is kept intact to maintain driver compatibility.
774 * Use ngeteblk() to allocate a buffer of a size other than 1 KB.
776 struct buf *
777 geteblk(void)
779 return (ngeteblk((long)1024));
783 * Return a buffer w/o sleeping
785 struct buf *
786 trygetblk(dev_t dev, daddr_t blkno)
788 struct buf *bp;
789 struct buf *dp;
790 struct hbuf *hp;
791 kmutex_t *hmp;
792 uint_t index;
794 index = bio_bhash(dev, blkno);
795 hp = &hbuf[index];
796 hmp = &hp->b_lock;
798 if (!mutex_tryenter(hmp))
799 return (NULL);
801 dp = (struct buf *)hp;
802 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
803 if (bp->b_blkno != blkno || bp->b_edev != dev ||
804 (bp->b_flags & B_STALE))
805 continue;
807 * Get access to a valid buffer without sleeping
809 if (sema_tryp(&bp->b_sem)) {
810 if (bp->b_flags & B_DONE) {
811 hp->b_length--;
812 notavail(bp);
813 mutex_exit(hmp);
814 return (bp);
815 } else {
816 sema_v(&bp->b_sem);
817 break;
820 break;
822 mutex_exit(hmp);
823 return (NULL);
827 * Wait for I/O completion on the buffer; return errors
828 * to the user.
831 iowait(struct buf *bp)
833 ASSERT(SEMA_HELD(&bp->b_sem));
834 return (biowait(bp));
838 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
839 * and wake up anyone waiting for it.
841 void
842 iodone(struct buf *bp)
844 ASSERT(SEMA_HELD(&bp->b_sem));
845 (void) biodone(bp);
849 * Zero the core associated with a buffer.
851 void
852 clrbuf(struct buf *bp)
854 ASSERT(SEMA_HELD(&bp->b_sem));
855 bzero(bp->b_un.b_addr, bp->b_bcount);
856 bp->b_resid = 0;
861 * Make sure all write-behind blocks on dev (or NODEV for all)
862 * are flushed out.
864 void
865 bflush(dev_t dev)
867 struct buf *bp, *dp;
868 struct hbuf *hp;
869 struct buf *delwri_list = EMPTY_LIST;
870 int i, index;
871 kmutex_t *hmp;
873 mutex_enter(&blist_lock);
875 * Wait for any invalidates or flushes ahead of us to finish.
876 * We really could split blist_lock up per device for better
877 * parallelism here.
879 while (bio_doinginval || bio_doingflush) {
880 bio_flinv_cv_wanted = 1;
881 cv_wait(&bio_flushinval_cv, &blist_lock);
883 bio_doingflush++;
885 * Gather all B_DELWRI buffers for the device.
886 * Lock ordering is b_sem > hash lock (brelse).
887 * Since we are finding the buffers via the delayed write list,
888 * they may be busy and we would block trying to get the
889 * b_sem lock while holding the hash lock. So transfer all the
890 * candidates onto delwri_list and then drop the hash locks.
892 for (i = 0; i < v.v_hbuf; i++) {
893 hmp = &hbuf[i].b_lock;
894 dp = (struct buf *)&dwbuf[i];
895 mutex_enter(hmp);
896 for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
897 if (dev == NODEV || bp->b_edev == dev) {
898 if (bp->b_list == NULL) {
899 bp->b_list = delwri_list;
900 delwri_list = bp;
904 mutex_exit(hmp);
906 mutex_exit(&blist_lock);
909 * Now that the hash locks have been dropped grab the semaphores
910 * and write back all the buffers that have B_DELWRI set.
912 while (delwri_list != EMPTY_LIST) {
913 bp = delwri_list;
915 sema_p(&bp->b_sem); /* may block */
916 if ((dev != bp->b_edev && dev != NODEV) ||
917 (panicstr && bp->b_flags & B_BUSY)) {
918 sema_v(&bp->b_sem);
919 delwri_list = bp->b_list;
920 bp->b_list = NULL;
921 continue; /* No longer a candidate */
923 if (bp->b_flags & B_DELWRI) {
924 index = bio_bhash(bp->b_edev, bp->b_blkno);
925 hp = &hbuf[index];
926 hmp = &hp->b_lock;
927 dp = (struct buf *)hp;
929 bp->b_flags |= B_ASYNC;
930 mutex_enter(hmp);
931 hp->b_length--;
932 notavail(bp);
933 mutex_exit(hmp);
934 if (bp->b_vp == NULL) { /* !ufs */
935 BWRITE(bp);
936 } else { /* ufs */
937 UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
939 } else {
940 sema_v(&bp->b_sem);
942 delwri_list = bp->b_list;
943 bp->b_list = NULL;
945 mutex_enter(&blist_lock);
946 bio_doingflush--;
947 if (bio_flinv_cv_wanted) {
948 bio_flinv_cv_wanted = 0;
949 cv_broadcast(&bio_flushinval_cv);
951 mutex_exit(&blist_lock);
955 * Ensure that a specified block is up-to-date on disk.
957 void
958 blkflush(dev_t dev, daddr_t blkno)
960 struct buf *bp, *dp;
961 struct hbuf *hp;
962 struct buf *sbp = NULL;
963 uint_t index;
964 kmutex_t *hmp;
966 index = bio_bhash(dev, blkno);
967 hp = &hbuf[index];
968 dp = (struct buf *)hp;
969 hmp = &hp->b_lock;
972 * Identify the buffer in the cache belonging to
973 * this device and blkno (if any).
975 mutex_enter(hmp);
976 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
977 if (bp->b_blkno != blkno || bp->b_edev != dev ||
978 (bp->b_flags & B_STALE))
979 continue;
980 sbp = bp;
981 break;
983 mutex_exit(hmp);
984 if (sbp == NULL)
985 return;
987 * Now check the buffer we have identified and
988 * make sure it still belongs to the device and is B_DELWRI
990 sema_p(&sbp->b_sem);
991 if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
992 (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
993 mutex_enter(hmp);
994 hp->b_length--;
995 notavail(sbp);
996 mutex_exit(hmp);
998 * XXX - There is nothing to guarantee a synchronous
999 * write here if the B_ASYNC flag is set. This needs
1000 * some investigation.
1002 if (sbp->b_vp == NULL) { /* !ufs */
1003 BWRITE(sbp); /* synchronous write */
1004 } else { /* ufs */
1005 UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
1007 } else {
1008 sema_v(&sbp->b_sem);
1013 * Same as binval, except it can force-invalidate delayed-write buffers
1014 * (which may not have been flushed yet because of device errors). Also
1015 * makes sure that the retry write flag is cleared.
1018 bfinval(dev_t dev, int force)
1020 struct buf *dp;
1021 struct buf *bp;
1022 struct buf *binval_list = EMPTY_LIST;
1023 int i, error = 0;
1024 kmutex_t *hmp;
1025 uint_t index;
1026 struct buf **backp;
1028 mutex_enter(&blist_lock);
1030 * Wait for any flushes ahead of us to finish; it's OK to
1031 * do invalidates in parallel.
1033 while (bio_doingflush) {
1034 bio_flinv_cv_wanted = 1;
1035 cv_wait(&bio_flushinval_cv, &blist_lock);
1037 bio_doinginval++;
1039 /* Gather bp's */
1040 for (i = 0; i < v.v_hbuf; i++) {
1041 dp = (struct buf *)&hbuf[i];
1042 hmp = &hbuf[i].b_lock;
1044 mutex_enter(hmp);
1045 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1046 if (bp->b_edev == dev) {
1047 if (bp->b_list == NULL) {
1048 bp->b_list = binval_list;
1049 binval_list = bp;
1053 mutex_exit(hmp);
1055 mutex_exit(&blist_lock);
1057 /* Invalidate all bp's found */
1058 while (binval_list != EMPTY_LIST) {
1059 bp = binval_list;
1061 sema_p(&bp->b_sem);
1062 if (bp->b_edev == dev) {
1063 if (force && (bp->b_flags & B_DELWRI)) {
1064 /* clear B_DELWRI, move to non-dw freelist */
1065 index = bio_bhash(bp->b_edev, bp->b_blkno);
1066 hmp = &hbuf[index].b_lock;
1067 dp = (struct buf *)&hbuf[index];
1068 mutex_enter(hmp);
1070 /* remove from delayed write freelist */
1071 notavail(bp);
1073 /* add to B_AGE side of non-dw freelist */
1074 backp = &dp->av_forw;
1075 (*backp)->av_back = bp;
1076 bp->av_forw = *backp;
1077 *backp = bp;
1078 bp->av_back = dp;
1081 * make sure write retries and busy are cleared
1083 bp->b_flags &=
1084 ~(B_BUSY | B_DELWRI | B_RETRYWRI);
1085 mutex_exit(hmp);
1087 if ((bp->b_flags & B_DELWRI) == 0)
1088 bp->b_flags |= B_STALE|B_AGE;
1089 else
1090 error = EIO;
1092 sema_v(&bp->b_sem);
1093 binval_list = bp->b_list;
1094 bp->b_list = NULL;
1096 mutex_enter(&blist_lock);
1097 bio_doinginval--;
1098 if (bio_flinv_cv_wanted) {
1099 cv_broadcast(&bio_flushinval_cv);
1100 bio_flinv_cv_wanted = 0;
1102 mutex_exit(&blist_lock);
1103 return (error);
1107 * If possible, invalidate blocks for a dev on demand
1109 void
1110 binval(dev_t dev)
1112 (void) bfinval(dev, 0);
1116 * Initialize the buffer I/O system by freeing
1117 * all buffers and setting all device hash buffer lists to empty.
1119 void
1120 binit(void)
1122 struct buf *bp;
1123 unsigned int i, pct;
1124 ulong_t bio_max_hwm, bio_default_hwm;
1127 * Maximum/Default values for bufhwm are set to the smallest of:
1128 * - BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
1129 * - 1/4 of kernel virtual memory
1130 * - INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
1131 * Additionally, in order to allow simple tuning by percentage of
1132 * physical memory, bufhwm_pct is used to calculate the default if
1133 * the value of this tunable is between 1 and 100 / BIO_MAX_PERCENT (i.e. 20).
1135 * Since the unit for v.v_bufhwm is kilobytes, this allows for
1136 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
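 *
 * As a worked example: with 4 KB pages and 4 GB of physical memory
 * (physmem = 1048576 pages), and assuming the kernel heap term is not
 * the limiting factor, the default hwm is (1048576 / 50) * 4 ~= 83884 KB
 * (about 2% of memory) and the maximum is (1048576 / 5) * 4 ~= 838860 KB
 * (about 20% of memory), both expressed in kilobytes as v.v_bufhwm
 * requires.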
1138 bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
1139 btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1140 bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);
1142 pct = BIO_BUF_PERCENT;
1143 if (bufhwm_pct != 0 &&
1144 ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
1145 pct = BIO_BUF_PERCENT;
1147 * Invalid user specified value, emit a warning.
1149 cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
1150 range(1..%d). Using %d as default.",
1151 bufhwm_pct,
1152 100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
1155 bio_default_hwm = MIN(physmem / pct,
1156 btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1157 bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);
1159 if ((v.v_bufhwm = bufhwm) == 0)
1160 v.v_bufhwm = bio_default_hwm;
1162 if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
1163 v.v_bufhwm = (int)bio_max_hwm;
1165 * Invalid user specified value, emit a warning.
1167 cmn_err(CE_WARN,
1168 "binit: bufhwm(%d) out \
1169 of range(%d..%lu). Using %lu as default",
1170 bufhwm,
1171 BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
1175 * Determine the number of hash buckets. Default is to
1176 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
1177 * Round up number to the next power of 2.
1179 v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
1180 BIO_HASHLEN);
1181 v.v_hmask = v.v_hbuf - 1;
1182 v.v_buf = BIO_BHDR_POOL;
1184 hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);
1186 dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);
1188 bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
1189 bp = &bfreelist;
1190 bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;
1192 for (i = 0; i < v.v_hbuf; i++) {
1193 hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
1194 hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];
1197 * Initialize the delayed write buffer list.
1199 dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
1200 dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
1205 * Wait for I/O completion on the buffer; return error code.
1206 * If bp was for synchronous I/O, bp is invalid and associated
1207 * resources are freed on return.
1210 biowait(struct buf *bp)
1212 int error = 0;
1213 struct cpu *cpup;
1215 ASSERT(SEMA_HELD(&bp->b_sem));
1217 cpup = CPU;
1218 atomic_inc_64(&cpup->cpu_stats.sys.iowait);
1219 DTRACE_IO1(wait__start, struct buf *, bp);
1222 * In case of panic, busy wait for completion
1224 if (panicstr) {
1225 while ((bp->b_flags & B_DONE) == 0)
1226 drv_usecwait(10);
1227 } else
1228 sema_p(&bp->b_io);
1230 DTRACE_IO1(wait__done, struct buf *, bp);
1231 atomic_dec_64(&cpup->cpu_stats.sys.iowait);
1233 error = geterror(bp);
1234 if ((bp->b_flags & B_ASYNC) == 0) {
1235 if (bp->b_flags & B_REMAPPED)
1236 bp_mapout(bp);
1238 return (error);
1241 static void
1242 biodone_tnf_probe(struct buf *bp)
1244 /* Kernel probe */
1245 TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
1246 tnf_device, device, bp->b_edev,
1247 tnf_diskaddr, block, bp->b_lblkno,
1248 tnf_opaque, buf, bp);
1252 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
1253 * and wake up anyone waiting for it.
1255 void
1256 biodone(struct buf *bp)
1258 if (bp->b_flags & B_STARTED) {
1259 DTRACE_IO1(done, struct buf *, bp);
1260 bp->b_flags &= ~B_STARTED;
1264 * Call the TNF probe here instead of the inline code
1265 * to force our compiler to use the tail call optimization.
1267 biodone_tnf_probe(bp);
1269 if (bp->b_iodone != NULL) {
1270 (*(bp->b_iodone))(bp);
1271 return;
1273 ASSERT((bp->b_flags & B_DONE) == 0);
1274 ASSERT(SEMA_HELD(&bp->b_sem));
1275 bp->b_flags |= B_DONE;
1276 if (bp->b_flags & B_ASYNC) {
1277 if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
1278 bio_pageio_done(bp);
1279 else
1280 brelse(bp); /* release bp to freelist */
1281 } else {
1282 sema_v(&bp->b_io);
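/*
 * A minimal sketch of the usual driver-side use of bioerror() and
 * biodone() when a transfer completes (the example_softc fields and the
 * interrupt handler shown are purely illustrative):
 *
 *	static uint_t
 *	example_intr(caddr_t arg)
 *	{
 *		struct example_softc *sc = (struct example_softc *)arg;
 *		struct buf *bp = sc->sc_curbp;
 *
 *		if (sc->sc_hw_error)
 *			bioerror(bp, EIO);
 *		bp->b_resid = sc->sc_resid;
 *		biodone(bp);
 *		return (DDI_INTR_CLAIMED);
 *	}
 */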
1287 * Pick up the device's error number and pass it to the user;
1288 * if there is an error but the number is 0 set a generalized code.
1291 geterror(struct buf *bp)
1293 int error = 0;
1295 ASSERT(SEMA_HELD(&bp->b_sem));
1296 if (bp->b_flags & B_ERROR) {
1297 error = bp->b_error;
1298 if (!error)
1299 error = EIO;
1301 return (error);
1305 * Support for pageio buffers.
1307 * This stuff should be generalized to provide a general bp
1308 * header facility that can be used for things other than pageio.
1312 * Allocate and initialize a buf struct for use with pageio.
1314 struct buf *
1315 pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
1317 struct buf *bp;
1318 struct cpu *cpup;
1320 if (flags & B_READ) {
1321 CPU_STATS_ENTER_K();
1322 cpup = CPU; /* get pointer AFTER preemption is disabled */
1323 CPU_STATS_ADDQ(cpup, vm, pgin, 1);
1324 CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
1326 atomic_add_64(&curzone->zone_pgpgin, btopr(len));
1328 if ((flags & B_ASYNC) == 0) {
1329 klwp_t *lwp = ttolwp(curthread);
1330 if (lwp != NULL)
1331 lwp->lwp_ru.majflt++;
1332 CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
1333 /* Kernel probe */
1334 TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
1335 tnf_opaque, vnode, pp->p_vnode,
1336 tnf_offset, offset, pp->p_offset);
1339 * Update statistics for pages being paged in
1341 if (pp != NULL && pp->p_vnode != NULL) {
1342 if (IS_SWAPFSVP(pp->p_vnode)) {
1343 CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
1344 atomic_add_64(&curzone->zone_anonpgin,
1345 btopr(len));
1346 } else {
1347 if (pp->p_vnode->v_flag & VVMEXEC) {
1348 CPU_STATS_ADDQ(cpup, vm, execpgin,
1349 btopr(len));
1350 atomic_add_64(&curzone->zone_execpgin,
1351 btopr(len));
1352 } else {
1353 CPU_STATS_ADDQ(cpup, vm, fspgin,
1354 btopr(len));
1355 atomic_add_64(&curzone->zone_fspgin,
1356 btopr(len));
1360 CPU_STATS_EXIT_K();
1361 TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
1362 "page_ws_in:pp %p", pp);
1363 /* Kernel probe */
1364 TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
1365 tnf_opaque, vnode, pp->p_vnode,
1366 tnf_offset, offset, pp->p_offset,
1367 tnf_size, size, len);
1370 bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
1371 bp->b_bcount = len;
1372 bp->b_bufsize = len;
1373 bp->b_pages = pp;
1374 bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
1375 bp->b_offset = -1;
1376 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
1378 /* Initialize bp->b_sem in "locked" state */
1379 sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
1381 VN_HOLD(vp);
1382 bp->b_vp = vp;
1383 THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */
1386 * Caller sets dev & blkno and can adjust
1387 * b_addr for page offset and can use bp_mapin
1388 * to make pages kernel addressable.
1390 return (bp);
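/*
 * A minimal synchronous usage sketch of the pageio interfaces ('pp',
 * 'vp', 'dev', 'blkno' and 'len' are caller supplied; error handling
 * trimmed):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	bp = pageio_setup(pp, len, vp, B_READ);
 *	bp->b_edev = dev;
 *	bp->b_blkno = blkno;
 *	bp->b_un.b_addr = (caddr_t)0;	 (offset of the I/O within the page)
 *	(void) bdev_strategy(bp);
 *	error = biowait(bp);
 *	pageio_done(bp);
 */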
1393 void
1394 pageio_done(struct buf *bp)
1396 ASSERT(SEMA_HELD(&bp->b_sem));
1397 if (bp->b_flags & B_REMAPPED)
1398 bp_mapout(bp);
1399 VN_RELE(bp->b_vp);
1400 bp->b_vp = NULL;
1401 ASSERT((bp->b_flags & B_NOCACHE) != 0);
1403 /* A sema_v(bp->b_sem) is implied if we are destroying it */
1404 sema_destroy(&bp->b_sem);
1405 sema_destroy(&bp->b_io);
1406 kmem_free(bp, sizeof (struct buf));
1410 * Check whether any buffers associated with the device, other than
1411 * the one pointed to by sbp, are busy.
1412 * NOTE: This expensive operation should be improved together with ufs_icheck().
1415 bcheck(dev_t dev, struct buf *sbp)
1417 struct buf *bp;
1418 struct buf *dp;
1419 int i;
1420 kmutex_t *hmp;
1423 * check for busy bufs for this filesystem
1425 for (i = 0; i < v.v_hbuf; i++) {
1426 dp = (struct buf *)&hbuf[i];
1427 hmp = &hbuf[i].b_lock;
1429 mutex_enter(hmp);
1430 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1432 * if buf is busy or dirty, then filesystem is busy
1434 if ((bp->b_edev == dev) &&
1435 ((bp->b_flags & B_STALE) == 0) &&
1436 (bp->b_flags & (B_DELWRI|B_BUSY)) &&
1437 (bp != sbp)) {
1438 mutex_exit(hmp);
1439 return (1);
1442 mutex_exit(hmp);
1444 return (0);
1448 * Hash two 32 bit entities.
1451 hash2ints(int x, int y)
1453 int hash = 0;
1455 hash = x - 1;
1456 hash = ((hash * 7) + (x >> 8)) - 1;
1457 hash = ((hash * 7) + (x >> 16)) - 1;
1458 hash = ((hash * 7) + (x >> 24)) - 1;
1459 hash = ((hash * 7) + y) - 1;
1460 hash = ((hash * 7) + (y >> 8)) - 1;
1461 hash = ((hash * 7) + (y >> 16)) - 1;
1462 hash = ((hash * 7) + (y >> 24)) - 1;
1464 return (hash);
1469 * Return a new buffer struct.
1470 * Create a new buffer if we haven't gone over our high water
1471 * mark for memory, otherwise try to get one off the freelist.
1473 * Returns a locked buf that has no id and is not on any hash or free
1474 * list.
1476 static struct buf *
1477 bio_getfreeblk(long bsize)
1479 struct buf *bp, *dp;
1480 struct hbuf *hp;
1481 kmutex_t *hmp;
1482 uint_t start, end;
1485 * bfreelist.b_bufsize (all references to it are protected by
1486 * bfree_lock) represents the amount of memory we are still
1487 * allowed to allocate in the cache before we hit our
1488 * high water mark.
1490 bio_mem_get(bsize); /* Account for our memory request */
1492 again:
1493 bp = bio_bhdr_alloc(); /* Get a buf hdr */
1494 sema_p(&bp->b_sem); /* Should never fail */
1496 ASSERT(bp->b_un.b_addr == NULL);
1497 bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
1498 if (bp->b_un.b_addr != NULL) {
1500 * Make the common path short
1502 bp->b_bufsize = bsize;
1503 ASSERT(SEMA_HELD(&bp->b_sem));
1504 return (bp);
1505 } else {
1506 struct buf *save;
1508 save = bp; /* Save bp we allocated */
1509 start = end = lastindex;
1511 biostats.bio_bufwant.value.ui32++;
1514 * Memory isn't available from the system now. Scan
1515 * the hash buckets till enough space is found.
1517 do {
1518 hp = &hbuf[start];
1519 hmp = &hp->b_lock;
1520 dp = (struct buf *)hp;
1522 mutex_enter(hmp);
1523 bp = dp->av_forw;
1525 while (bp != dp) {
1527 ASSERT(bp != NULL);
1529 if (!sema_tryp(&bp->b_sem)) {
1530 bp = bp->av_forw;
1531 continue;
1535 * Since we are going down the freelist
1536 * associated with this hash bucket the
1537 * B_DELWRI flag should not be set.
1539 ASSERT(!(bp->b_flags & B_DELWRI));
1541 if (bp->b_bufsize == bsize) {
1542 hp->b_length--;
1543 notavail(bp);
1544 bremhash(bp);
1545 mutex_exit(hmp);
1548 * Didn't kmem_alloc any more, so don't
1549 * count it twice.
1551 mutex_enter(&bfree_lock);
1552 bfreelist.b_bufsize += bsize;
1553 mutex_exit(&bfree_lock);
1556 * Update the lastindex value.
1558 lastindex = start;
1561 * Put our saved bp back on the list
1563 sema_v(&save->b_sem);
1564 bio_bhdr_free(save);
1565 ASSERT(SEMA_HELD(&bp->b_sem));
1566 return (bp);
1568 sema_v(&bp->b_sem);
1569 bp = bp->av_forw;
1571 mutex_exit(hmp);
1572 start = ((start + 1) % v.v_hbuf);
1573 } while (start != end);
1575 biostats.bio_bufwait.value.ui32++;
1576 bp = save; /* Use original bp */
1577 bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
1580 bp->b_bufsize = bsize;
1581 ASSERT(SEMA_HELD(&bp->b_sem));
1582 return (bp);
1586 * Allocate a buffer header. If none currently available, allocate
1587 * a new pool.
1589 static struct buf *
1590 bio_bhdr_alloc(void)
1592 struct buf *dp, *sdp;
1593 struct buf *bp;
1594 int i;
1596 for (;;) {
1597 mutex_enter(&bhdr_lock);
1598 if (bhdrlist != NULL) {
1599 bp = bhdrlist;
1600 bhdrlist = bp->av_forw;
1601 mutex_exit(&bhdr_lock);
1602 bp->av_forw = NULL;
1603 return (bp);
1605 mutex_exit(&bhdr_lock);
1608 * Need to allocate a new pool. If the system is currently
1609 * out of memory, then try freeing things on the freelist.
1611 dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
1612 if (dp == NULL) {
1614 * System can't give us a pool of headers, try
1615 * recycling from the free lists.
1617 bio_recycle(BIO_HEADER, 0);
1618 } else {
1619 sdp = dp;
1620 for (i = 0; i < v.v_buf; i++, dp++) {
1622 * The next two lines are needed since NODEV
1623 * is -1 and not NULL
1625 dp->b_dev = (o_dev_t)NODEV;
1626 dp->b_edev = NODEV;
1627 dp->av_forw = dp + 1;
1628 sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
1629 NULL);
1630 sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
1631 NULL);
1632 dp->b_offset = -1;
1634 mutex_enter(&bhdr_lock);
1635 (--dp)->av_forw = bhdrlist; /* Fix last pointer */
1636 bhdrlist = sdp;
1637 nbuf += v.v_buf;
1638 bp = bhdrlist;
1639 bhdrlist = bp->av_forw;
1640 mutex_exit(&bhdr_lock);
1642 bp->av_forw = NULL;
1643 return (bp);
1648 static void
1649 bio_bhdr_free(struct buf *bp)
1651 ASSERT(bp->b_back == NULL);
1652 ASSERT(bp->b_forw == NULL);
1653 ASSERT(bp->av_back == NULL);
1654 ASSERT(bp->av_forw == NULL);
1655 ASSERT(bp->b_un.b_addr == NULL);
1656 ASSERT(bp->b_dev == (o_dev_t)NODEV);
1657 ASSERT(bp->b_edev == NODEV);
1658 ASSERT(bp->b_flags == 0);
1660 mutex_enter(&bhdr_lock);
1661 bp->av_forw = bhdrlist;
1662 bhdrlist = bp;
1663 mutex_exit(&bhdr_lock);
1667 * If we haven't gone over the high water mark, it's o.k. to
1668 * allocate more buffer space, otherwise recycle buffers
1669 * from the freelist until enough memory is free for a bsize request.
1671 * We account for this memory, even though
1672 * we don't allocate it here.
1674 static void
1675 bio_mem_get(long bsize)
1677 mutex_enter(&bfree_lock);
1678 if (bfreelist.b_bufsize > bsize) {
1679 bfreelist.b_bufsize -= bsize;
1680 mutex_exit(&bfree_lock);
1681 return;
1683 mutex_exit(&bfree_lock);
1684 bio_recycle(BIO_MEM, bsize);
1688 * flush a list of delayed write buffers.
1689 * (currently used only by bio_recycle below.)
1691 static void
1692 bio_flushlist(struct buf *delwri_list)
1694 struct buf *bp;
1696 while (delwri_list != EMPTY_LIST) {
1697 bp = delwri_list;
1698 bp->b_flags |= B_AGE | B_ASYNC;
1699 if (bp->b_vp == NULL) { /* !ufs */
1700 BWRITE(bp);
1701 } else { /* ufs */
1702 UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
1704 delwri_list = bp->b_list;
1705 bp->b_list = NULL;
1710 * Start recycling buffers on the freelist for one of 2 reasons:
1711 * - we need a buffer header
1712 * - we need to free up memory
1713 * Once started we continue to recycle buffers until the B_AGE
1714 * buffers are gone.
1716 static void
1717 bio_recycle(int want, long bsize)
1719 struct buf *bp, *dp, *dwp, *nbp;
1720 struct hbuf *hp;
1721 int found = 0;
1722 kmutex_t *hmp;
1723 int start, end;
1724 struct buf *delwri_list = EMPTY_LIST;
1727 * Recycle buffers.
1729 top:
1730 start = end = lastindex;
1731 do {
1732 hp = &hbuf[start];
1733 hmp = &hp->b_lock;
1734 dp = (struct buf *)hp;
1736 mutex_enter(hmp);
1737 bp = dp->av_forw;
1739 while (bp != dp) {
1741 ASSERT(bp != NULL);
1743 if (!sema_tryp(&bp->b_sem)) {
1744 bp = bp->av_forw;
1745 continue;
1748 * Do we really want to nuke all of the B_AGE stuff??
1750 if ((bp->b_flags & B_AGE) == 0 && found) {
1751 sema_v(&bp->b_sem);
1752 mutex_exit(hmp);
1753 lastindex = start;
1754 return; /* All done */
1757 ASSERT(MUTEX_HELD(&hp->b_lock));
1758 ASSERT(!(bp->b_flags & B_DELWRI));
1759 hp->b_length--;
1760 notavail(bp);
1763 * Remove bhdr from cache, free up memory,
1764 * and add the hdr to the freelist.
1766 bremhash(bp);
1767 mutex_exit(hmp);
1769 if (bp->b_bufsize) {
1770 kmem_free(bp->b_un.b_addr, bp->b_bufsize);
1771 bp->b_un.b_addr = NULL;
1772 mutex_enter(&bfree_lock);
1773 bfreelist.b_bufsize += bp->b_bufsize;
1774 mutex_exit(&bfree_lock);
1777 bp->b_dev = (o_dev_t)NODEV;
1778 bp->b_edev = NODEV;
1779 bp->b_flags = 0;
1780 sema_v(&bp->b_sem);
1781 bio_bhdr_free(bp);
1782 if (want == BIO_HEADER) {
1783 found = 1;
1784 } else {
1785 ASSERT(want == BIO_MEM);
1786 if (!found && bfreelist.b_bufsize >= bsize) {
1787 /* Account for the memory we want */
1788 mutex_enter(&bfree_lock);
1789 if (bfreelist.b_bufsize >= bsize) {
1790 bfreelist.b_bufsize -= bsize;
1791 found = 1;
1793 mutex_exit(&bfree_lock);
1798 * Since we dropped hmp, start from the
1799 * beginning.
1801 mutex_enter(hmp);
1802 bp = dp->av_forw;
1804 mutex_exit(hmp);
1807 * Look at the delayed write list.
1808 * First gather into a private list, then write them.
1810 dwp = (struct buf *)&dwbuf[start];
1811 mutex_enter(&blist_lock);
1812 bio_doingflush++;
1813 mutex_enter(hmp);
1814 for (bp = dwp->av_forw; bp != dwp; bp = nbp) {
1816 ASSERT(bp != NULL);
1817 nbp = bp->av_forw;
1819 if (!sema_tryp(&bp->b_sem))
1820 continue;
1821 ASSERT(bp->b_flags & B_DELWRI);
1823 * Do we really want to nuke all of the B_AGE stuff??
1826 if ((bp->b_flags & B_AGE) == 0 && found) {
1827 sema_v(&bp->b_sem);
1828 mutex_exit(hmp);
1829 lastindex = start;
1830 mutex_exit(&blist_lock);
1831 bio_flushlist(delwri_list);
1832 mutex_enter(&blist_lock);
1833 bio_doingflush--;
1834 if (bio_flinv_cv_wanted) {
1835 bio_flinv_cv_wanted = 0;
1836 cv_broadcast(&bio_flushinval_cv);
1838 mutex_exit(&blist_lock);
1839 return; /* All done */
1843 * If the buffer is already on a flush or
1844 * invalidate list then just skip it.
1846 if (bp->b_list != NULL) {
1847 sema_v(&bp->b_sem);
1848 continue;
1851 * We are still on the same bucket.
1853 hp->b_length--;
1854 notavail(bp);
1855 bp->b_list = delwri_list;
1856 delwri_list = bp;
1858 mutex_exit(hmp);
1859 mutex_exit(&blist_lock);
1860 bio_flushlist(delwri_list);
1861 delwri_list = EMPTY_LIST;
1862 mutex_enter(&blist_lock);
1863 bio_doingflush--;
1864 if (bio_flinv_cv_wanted) {
1865 bio_flinv_cv_wanted = 0;
1866 cv_broadcast(&bio_flushinval_cv);
1868 mutex_exit(&blist_lock);
1869 start = (start + 1) % v.v_hbuf;
1871 } while (start != end);
1873 if (found)
1874 return;
1877 * Free lists exhausted and we haven't satisfied the request.
1878 * Wait here for more entries to be added to freelist.
1879 * Because this might have just happened, make it timed.
1881 mutex_enter(&bfree_lock);
1882 bfreelist.b_flags |= B_WANTED;
1883 (void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
1884 mutex_exit(&bfree_lock);
1885 goto top;
1889 * See if the block is associated with some buffer
1890 * (mainly to avoid getting hung up on a wait in breada).
1892 static int
1893 bio_incore(dev_t dev, daddr_t blkno)
1895 struct buf *bp;
1896 struct buf *dp;
1897 uint_t index;
1898 kmutex_t *hmp;
1900 index = bio_bhash(dev, blkno);
1901 dp = (struct buf *)&hbuf[index];
1902 hmp = &hbuf[index].b_lock;
1904 mutex_enter(hmp);
1905 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1906 if (bp->b_blkno == blkno && bp->b_edev == dev &&
1907 (bp->b_flags & B_STALE) == 0) {
1908 mutex_exit(hmp);
1909 return (1);
1912 mutex_exit(hmp);
1913 return (0);
1916 static void
1917 bio_pageio_done(struct buf *bp)
1919 if (bp->b_flags & B_PAGEIO) {
1921 if (bp->b_flags & B_REMAPPED)
1922 bp_mapout(bp);
1924 if (bp->b_flags & B_READ)
1925 pvn_read_done(bp->b_pages, bp->b_flags);
1926 else
1927 pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
1928 pageio_done(bp);
1929 } else {
1930 ASSERT(bp->b_flags & B_REMAPPED);
1931 bp_mapout(bp);
1932 brelse(bp);
1937 * bioerror(9F) - indicate error in buffer header
1938 * If 'error' is zero, remove the error indication.
1940 void
1941 bioerror(struct buf *bp, int error)
1943 ASSERT(bp != NULL);
1944 ASSERT(error >= 0);
1945 ASSERT(SEMA_HELD(&bp->b_sem));
1947 if (error != 0) {
1948 bp->b_flags |= B_ERROR;
1949 } else {
1950 bp->b_flags &= ~B_ERROR;
1952 bp->b_error = error;
1956 * bioreset(9F) - reuse a private buffer header after I/O is complete
1958 void
1959 bioreset(struct buf *bp)
1961 ASSERT(bp != NULL);
1963 biofini(bp);
1964 bioinit(bp);
1968 * biosize(9F) - return size of a buffer header
1970 size_t
1971 biosize(void)
1973 return (sizeof (struct buf));
1977 * biomodified(9F) - check if buffer is modified
1980 biomodified(struct buf *bp)
1982 int npf;
1983 int ppattr;
1984 struct page *pp;
1986 ASSERT(bp != NULL);
1988 if ((bp->b_flags & B_PAGEIO) == 0) {
1989 return (-1);
1991 pp = bp->b_pages;
1992 npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));
1994 while (npf > 0) {
1995 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
1996 HAT_SYNC_STOPON_MOD);
1997 if (ppattr & P_MOD)
1998 return (1);
1999 pp = pp->p_next;
2000 npf--;
2003 return (0);
2007 * bioinit(9F) - initialize a buffer structure
2009 void
2010 bioinit(struct buf *bp)
2012 bzero(bp, sizeof (struct buf));
2013 sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
2014 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
2015 bp->b_offset = -1;
2019 * biofini(9F) - uninitialize a buffer structure
2021 void
2022 biofini(struct buf *bp)
2024 sema_destroy(&bp->b_io);
2025 sema_destroy(&bp->b_sem);
2029 * bioclone(9F) - clone a buffer
2031 struct buf *
2032 bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
2033 int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
2035 struct buf *bufp;
2037 ASSERT(bp);
2038 if (bp_mem == NULL) {
2039 bufp = kmem_alloc(sizeof (struct buf), sleep);
2040 if (bufp == NULL) {
2041 return (NULL);
2043 bioinit(bufp);
2044 } else {
2045 bufp = bp_mem;
2046 bioreset(bufp);
2049 #define BUF_CLONE_FLAGS (B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
2050 B_ABRWRITE)
2053 * The cloned buffer does not inherit the B_REMAPPED flag.
2055 bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS) | B_BUSY;
2056 bufp->b_bcount = len;
2057 bufp->b_blkno = blkno;
2058 bufp->b_iodone = iodone;
2059 bufp->b_proc = bp->b_proc;
2060 bufp->b_edev = dev;
2061 bufp->b_file = bp->b_file;
2062 bufp->b_offset = bp->b_offset;
2064 if (bp->b_flags & B_SHADOW) {
2065 ASSERT(bp->b_shadow);
2066 ASSERT(bp->b_flags & B_PHYS);
2068 bufp->b_shadow = bp->b_shadow +
2069 btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
2070 bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2071 if (bp->b_flags & B_REMAPPED)
2072 bufp->b_proc = NULL;
2073 } else {
2074 if (bp->b_flags & B_PAGEIO) {
2075 struct page *pp;
2076 off_t o;
2077 int i;
2079 pp = bp->b_pages;
2080 o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
2081 for (i = btop(o); i > 0; i--) {
2082 pp = pp->p_next;
2084 bufp->b_pages = pp;
2085 bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
2086 } else {
2087 bufp->b_un.b_addr =
2088 (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2089 if (bp->b_flags & B_REMAPPED)
2090 bufp->b_proc = NULL;
2093 return (bufp);
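/*
 * A minimal sketch of cloning part of a parent transfer into a child buf
 * and issuing it synchronously ('pbp' is the parent buffer, 'len' the
 * length of the slice starting at byte offset 'off', and 'child_blkno'
 * where that slice lives on 'dev'; all are caller supplied):
 *
 *	struct buf *cbp;
 *	int error;
 *
 *	cbp = bioclone(pbp, off, len, dev, child_blkno,
 *	    NULL, NULL, KM_SLEEP);
 *	(void) bdev_strategy(cbp);
 *	error = biowait(cbp);
 *	freerbuf(cbp);		 (frees the header bioclone allocated above)
 */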