usr/src/uts/common/os/bio.c
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2011 Joyent, Inc. All rights reserved.
28 * Copyright (c) 2016 by Delphix. All rights reserved.
31 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
32 /* All Rights Reserved */
35 * University Copyright- Copyright (c) 1982, 1986, 1988
36 * The Regents of the University of California
37 * All Rights Reserved
39 * University Acknowledgment- Portions of this document are derived from
40 * software developed by the University of California, Berkeley, and its
41 * contributors.
44 #include <sys/types.h>
45 #include <sys/t_lock.h>
46 #include <sys/sysmacros.h>
47 #include <sys/conf.h>
48 #include <sys/cpuvar.h>
49 #include <sys/errno.h>
50 #include <sys/debug.h>
51 #include <sys/buf.h>
52 #include <sys/var.h>
53 #include <sys/vnode.h>
54 #include <sys/bitmap.h>
55 #include <sys/cmn_err.h>
56 #include <sys/kmem.h>
57 #include <sys/vmem.h>
58 #include <sys/atomic.h>
59 #include <vm/seg_kmem.h>
60 #include <vm/page.h>
61 #include <vm/pvn.h>
62 #include <sys/vtrace.h>
63 #include <sys/tnf_probe.h>
64 #include <sys/fs/ufs_inode.h>
65 #include <sys/fs/ufs_bio.h>
66 #include <sys/fs/ufs_log.h>
67 #include <sys/systm.h>
68 #include <sys/vfs.h>
69 #include <sys/sdt.h>
71 /* Locks */
72 static kmutex_t blist_lock; /* protects b_list */
73 static kmutex_t bhdr_lock; /* protects the bhdrlist */
74 static kmutex_t bfree_lock; /* protects the bfreelist structure */
76 struct hbuf *hbuf; /* Hash buckets */
77 struct dwbuf *dwbuf; /* Delayed write buckets */
78 static struct buf *bhdrlist; /* buf header free list */
79 static int nbuf; /* number of buffer headers allocated */
81 static int lastindex; /* Reference point on where to start */
82 /* when looking for free buffers */
84 #define bio_bhash(dev, bn) (hash2ints((dev), (int)(bn)) & v.v_hmask)
85 #define EMPTY_LIST ((struct buf *)-1)
87 static kcondvar_t bio_mem_cv; /* Condition variables */
88 static kcondvar_t bio_flushinval_cv;
89 static int bio_doingflush; /* flush in progress */
90 static int bio_doinginval; /* inval in progress */
91 static int bio_flinv_cv_wanted; /* someone waiting for cv */
94 * Statistics on the buffer cache
96 struct biostats biostats = {
97 { "buffer_cache_lookups", KSTAT_DATA_UINT32 },
98 { "buffer_cache_hits", KSTAT_DATA_UINT32 },
99 { "new_buffer_requests", KSTAT_DATA_UINT32 },
100 { "waits_for_buffer_allocs", KSTAT_DATA_UINT32 },
101 { "buffers_locked_by_someone", KSTAT_DATA_UINT32 },
102 { "duplicate_buffers_found", KSTAT_DATA_UINT32 }
106 * kstat data
108 kstat_named_t *biostats_ptr = (kstat_named_t *)&biostats;
109 uint_t biostats_ndata = (uint_t)(sizeof (biostats) /
110 sizeof (kstat_named_t));
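/*
 * biostats_ptr and biostats_ndata are meant to be handed to the kstat
 * framework as a "virtual" named kstat whose ks_data points directly at
 * the biostats structure above.  A minimal sketch of that wiring (the
 * module and class names here are illustrative, not a claim about where
 * the kstat is actually created):
 *
 *	kstat_t *ksp;
 *
 *	ksp = kstat_create("unix", 0, "biostats", "misc",
 *	    KSTAT_TYPE_NAMED, biostats_ndata, KSTAT_FLAG_VIRTUAL);
 *	if (ksp != NULL) {
 *		ksp->ks_data = (void *)biostats_ptr;
 *		kstat_install(ksp);
 *	}
 */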
113 * Statistics on ufs buffer cache
114 * Not protected by locks
116 struct ufsbiostats ub = {
117 { "breads", KSTAT_DATA_UINT32 },
118 { "bwrites", KSTAT_DATA_UINT32 },
119 { "fbiwrites", KSTAT_DATA_UINT32 },
120 { "getpages", KSTAT_DATA_UINT32 },
121 { "getras", KSTAT_DATA_UINT32 },
122 { "putsyncs", KSTAT_DATA_UINT32 },
123 { "putasyncs", KSTAT_DATA_UINT32 },
124 { "putpageios", KSTAT_DATA_UINT32 },
128 * more UFS Logging eccentricities...
130 * These function pointers are required because "#pragma weak ..." doesn't
131 * work in reverse order: genunix (bio.c) is loaded before the ufs modules,
132 * so pointers to ufs routines can't be plugged into bio.c calls at that
133 * point. Instead they are initialized when the "lufsops" table is set up
134 * in "lufs.c:_init()".
136 void (*bio_lufs_strategy)(void *, buf_t *);
137 void (*bio_snapshot_strategy)(void *, buf_t *);
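/*
 * A minimal sketch of how the logging module is expected to plug these
 * pointers in from lufs.c:_init() (the ufs-side function names shown
 * here are assumptions for illustration only):
 *
 *	extern void lufs_strategy(void *, buf_t *);
 *	extern void fssnap_strategy(void *, buf_t *);
 *
 *	bio_lufs_strategy = lufs_strategy;
 *	bio_snapshot_strategy = fssnap_strategy;
 */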
140 /* Private routines */
141 static struct buf *bio_getfreeblk(long);
142 static void bio_mem_get(long);
143 static void bio_bhdr_free(struct buf *);
144 static struct buf *bio_bhdr_alloc(void);
145 static void bio_recycle(int, long);
146 static void bio_pageio_done(struct buf *);
147 static int bio_incore(dev_t, daddr_t);
150 * Buffer cache constants
152 #define BIO_BUF_PERCENT (100/2) /* default: 2% of memory */
153 #define BIO_MAX_PERCENT (100/20) /* max is 20% of real memory */
154 #define BIO_BHDR_POOL 100 /* Default bhdr pool size */
155 #define BIO_MIN_HDR 10 /* Minimum number of buffer headers */
156 #define BIO_MIN_HWM (BIO_MIN_HDR * MAXBSIZE / 1024)
157 #define BIO_HASHLEN 4 /* Target length of hash chains */
160 /* Flags for bio_recycle() */
161 #define BIO_HEADER 0x01
162 #define BIO_MEM 0x02
164 extern int bufhwm; /* User tunable - high water mark for mem */
165 extern int bufhwm_pct; /* ditto - given in % of physmem */
168 * The following routines allocate and free
169 * buffers with various side effects. In general the
170 * arguments to an allocate routine are a device and
171 * a block number, and the value is a pointer to
172 * the buffer header; the buffer returned is locked with a
173 * binary semaphore so that no one else can touch it. If the block was
174 * already in core, no I/O need be done; if it is
175 * already locked, the process waits until it becomes free.
176 * The following routines allocate a buffer:
177 * getblk
178 * bread/BREAD
179 * breada
180 * Eventually the buffer must be released, possibly with the
181 * side effect of writing it out, by using one of
182 * bwrite/BWRITE/brwrite
183 * bdwrite/bdrwrite
184 * bawrite
185 * brelse
187 * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
188 * Instead, a binary semaphore, b_sem is used to gain exclusive access to
189 * a buffer and a binary semaphore, b_io is used for I/O synchronization.
190 * B_DONE is still used to denote a buffer with I/O complete on it.
192 * The bfreelist.b_bcount field is computed every time fsflush runs. It
193 * is only an approximation and should not be used where a very accurate
194 * count of the free buffers is needed.
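/*
 * A minimal usage sketch of the interfaces described above, with error
 * handling trimmed ('dev', 'blkno' and 'bsize' are caller supplied):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	bp = bread(dev, blkno, bsize);	 (read; returns the buffer locked)
 *	if ((error = geterror(bp)) != 0) {
 *		brelse(bp);		 (release, no I/O implied)
 *		return (error);
 *	}
 *	... modify bp->b_un.b_addr ...
 *	bdwrite(bp);			 (mark delayed-write and release)
 *
 * or, to write it out immediately and release it, use bwrite(bp) instead
 * of bdwrite(bp).
 */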
198 * Read in (if necessary) the block and return a buffer pointer.
200 * This interface is provided for binary compatibility. Using
201 * BREAD() directly avoids the extra function call overhead invoked
202 * by calling this routine.
204 struct buf *
205 bread(dev_t dev, daddr_t blkno, long bsize)
207 return (BREAD(dev, blkno, bsize));
211 * Common code for reading a buffer with various options
213 * Read in (if necessary) the block and return a buffer pointer.
215 struct buf *
216 bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
218 struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
219 struct buf *bp;
220 klwp_t *lwp = ttolwp(curthread);
222 CPU_STATS_ADD_K(sys, lread, 1);
223 bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
224 if (bp->b_flags & B_DONE)
225 return (bp);
226 bp->b_flags |= B_READ;
227 ASSERT(bp->b_bcount == bsize);
228 if (ufsvfsp == NULL) { /* !ufs */
229 (void) bdev_strategy(bp);
230 } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
231 /* ufs && logging */
232 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
233 } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
234 /* ufs && snapshots */
235 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
236 } else {
237 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
238 ub.ub_breads.value.ul++; /* ufs && !logging */
239 (void) bdev_strategy(bp);
241 if (lwp != NULL)
242 lwp->lwp_ru.inblock++;
243 CPU_STATS_ADD_K(sys, bread, 1);
244 (void) biowait(bp);
245 return (bp);
249 * Read in the block, like bread, but also start I/O on the
250 * read-ahead block (which is not allocated to the caller).
252 struct buf *
253 breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
255 struct buf *bp, *rabp;
256 klwp_t *lwp = ttolwp(curthread);
258 bp = NULL;
259 if (!bio_incore(dev, blkno)) {
260 CPU_STATS_ADD_K(sys, lread, 1);
261 bp = GETBLK(dev, blkno, bsize);
262 if ((bp->b_flags & B_DONE) == 0) {
263 bp->b_flags |= B_READ;
264 bp->b_bcount = bsize;
265 (void) bdev_strategy(bp);
266 if (lwp != NULL)
267 lwp->lwp_ru.inblock++;
268 CPU_STATS_ADD_K(sys, bread, 1);
271 if (rablkno && bfreelist.b_bcount > 1 &&
272 !bio_incore(dev, rablkno)) {
273 rabp = GETBLK(dev, rablkno, bsize);
274 if (rabp->b_flags & B_DONE)
275 brelse(rabp);
276 else {
277 rabp->b_flags |= B_READ|B_ASYNC;
278 rabp->b_bcount = bsize;
279 (void) bdev_strategy(rabp);
280 if (lwp != NULL)
281 lwp->lwp_ru.inblock++;
282 CPU_STATS_ADD_K(sys, bread, 1);
285 if (bp == NULL)
286 return (BREAD(dev, blkno, bsize));
287 (void) biowait(bp);
288 return (bp);
292 * Common code for writing a buffer with various options.
294 * force_wait - wait for write completion regardless of B_ASYNC flag
295 * do_relse - release the buffer when we are done
296 * clear_flags - flags to clear from the buffer
298 void
299 bwrite_common(void *arg, struct buf *bp, int force_wait,
300 int do_relse, int clear_flags)
302 register int do_wait;
303 struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
304 int flag;
305 klwp_t *lwp = ttolwp(curthread);
306 struct cpu *cpup;
308 ASSERT(SEMA_HELD(&bp->b_sem));
309 flag = bp->b_flags;
310 bp->b_flags &= ~clear_flags;
311 if (lwp != NULL)
312 lwp->lwp_ru.oublock++;
313 CPU_STATS_ENTER_K();
314 cpup = CPU; /* get pointer AFTER preemption is disabled */
315 CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
316 CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
317 do_wait = ((flag & B_ASYNC) == 0 || force_wait);
318 if (do_wait == 0)
319 CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
320 CPU_STATS_EXIT_K();
321 if (ufsvfsp == NULL) {
322 (void) bdev_strategy(bp);
323 } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
324 /* ufs && logging */
325 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
326 } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
327 /* ufs && snapshots */
328 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
329 } else {
330 ub.ub_bwrites.value.ul++; /* ufs && !logging */
331 (void) bdev_strategy(bp);
333 if (do_wait) {
334 (void) biowait(bp);
335 if (do_relse) {
336 brelse(bp);
342 * Write the buffer, waiting for completion (unless B_ASYNC is set).
343 * Then release the buffer.
344 * This interface is provided for binary compatibility. Using
345 * BWRITE() directly avoids the extra function call overhead invoked
346 * by calling this routine.
348 void
349 bwrite(struct buf *bp)
351 BWRITE(bp);
355 * Write the buffer, waiting for completion.
356 * But don't release the buffer afterwards.
357 * This interface is provided for binary compatibility. Using
358 * BWRITE2() directly avoids the extra function call overhead.
360 void
361 bwrite2(struct buf *bp)
363 BWRITE2(bp);
367 * Release the buffer, marking it so that if it is grabbed
368 * for another purpose it will be written out before being
369 * given up (e.g. when writing a partial block where it is
370 * assumed that another write for the same block will soon follow).
371 * Also save the time that the block is first marked as delayed
372 * so that it will be written in a reasonable time.
374 void
375 bdwrite(struct buf *bp)
377 ASSERT(SEMA_HELD(&bp->b_sem));
378 CPU_STATS_ADD_K(sys, lwrite, 1);
379 if ((bp->b_flags & B_DELWRI) == 0)
380 bp->b_start = ddi_get_lbolt();
382 * B_DONE allows others to use the buffer, B_DELWRI causes the
383 * buffer to be written before being reused, and setting b_resid
384 * to zero says the buffer is complete.
386 bp->b_flags |= B_DELWRI | B_DONE;
387 bp->b_resid = 0;
388 brelse(bp);
392 * Release the buffer, start I/O on it, but don't wait for completion.
394 void
395 bawrite(struct buf *bp)
397 ASSERT(SEMA_HELD(&bp->b_sem));
399 /* Use bfreelist.b_bcount as a weird-ass heuristic */
400 if (bfreelist.b_bcount > 4)
401 bp->b_flags |= B_ASYNC;
402 BWRITE(bp);
406 * Release the buffer, with no I/O implied.
408 void
409 brelse(struct buf *bp)
411 struct buf **backp;
412 uint_t index;
413 kmutex_t *hmp;
414 struct buf *dp;
415 struct hbuf *hp;
418 ASSERT(SEMA_HELD(&bp->b_sem));
421 * Clear the retry write flag if the buffer was written without
422 * error. The presence of B_DELWRI means the buffer has not yet
423 * been written and the presence of B_ERROR means that an error
424 * is still occurring.
426 if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
427 bp->b_flags &= ~B_RETRYWRI;
430 /* Check for anomalous conditions */
431 if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
432 if (bp->b_flags & B_NOCACHE) {
433 /* Don't add to the freelist. Destroy it now */
434 kmem_free(bp->b_un.b_addr, bp->b_bufsize);
435 sema_destroy(&bp->b_sem);
436 sema_destroy(&bp->b_io);
437 kmem_free(bp, sizeof (struct buf));
438 return;
441 * If a write failed and we are supposed to retry write,
442 * don't toss the buffer. Keep it around and mark it
443 * delayed write in the hopes that it will eventually
444 * get flushed (and still keep the system running.)
446 if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
447 bp->b_flags |= B_DELWRI;
448 /* keep fsflush from trying continuously to flush */
449 bp->b_start = ddi_get_lbolt();
450 } else
451 bp->b_flags |= B_AGE|B_STALE;
452 bp->b_flags &= ~B_ERROR;
453 bp->b_error = 0;
457 * If delayed write is set then put in on the delayed
458 * write list instead of the free buffer list.
460 index = bio_bhash(bp->b_edev, bp->b_blkno);
461 hmp = &hbuf[index].b_lock;
463 mutex_enter(hmp);
464 hp = &hbuf[index];
465 dp = (struct buf *)hp;
468 * Make sure that the number of entries on this list is sane:
469 * 0 <= count < total # buffers
471 ASSERT(hp->b_length >= 0);
472 ASSERT(hp->b_length < nbuf);
474 hp->b_length++; /* We are adding this buffer */
476 if (bp->b_flags & B_DELWRI) {
478 * This buffer goes on the delayed write buffer list
480 dp = (struct buf *)&dwbuf[index];
482 ASSERT(bp->b_bufsize > 0);
483 ASSERT(bp->b_bcount > 0);
484 ASSERT(bp->b_un.b_addr != NULL);
486 if (bp->b_flags & B_AGE) {
487 backp = &dp->av_forw;
488 (*backp)->av_back = bp;
489 bp->av_forw = *backp;
490 *backp = bp;
491 bp->av_back = dp;
492 } else {
493 backp = &dp->av_back;
494 (*backp)->av_forw = bp;
495 bp->av_back = *backp;
496 *backp = bp;
497 bp->av_forw = dp;
499 mutex_exit(hmp);
501 if (bfreelist.b_flags & B_WANTED) {
503 * Should come here very very rarely.
505 mutex_enter(&bfree_lock);
506 if (bfreelist.b_flags & B_WANTED) {
507 bfreelist.b_flags &= ~B_WANTED;
508 cv_broadcast(&bio_mem_cv);
510 mutex_exit(&bfree_lock);
513 bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
515 * Don't let anyone get the buffer off the freelist before we
516 * release our hold on it.
518 sema_v(&bp->b_sem);
522 * Return a count of the number of B_BUSY buffers in the system
523 * Can only be used as a good estimate. If 'cleanit' is set,
524 * try to flush all bufs.
527 bio_busy(int cleanit)
529 struct buf *bp, *dp;
530 int busy = 0;
531 int i;
532 kmutex_t *hmp;
534 for (i = 0; i < v.v_hbuf; i++) {
535 dp = (struct buf *)&hbuf[i];
536 hmp = &hbuf[i].b_lock;
538 mutex_enter(hmp);
539 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
540 if (bp->b_flags & B_BUSY)
541 busy++;
543 mutex_exit(hmp);
546 if (cleanit && busy != 0) {
547 bflush(NODEV);
550 return (busy);
554 * this interface is provided for binary compatibility.
556 * Assign a buffer for the given block. If the appropriate
557 * block is already associated, return it; otherwise search
558 * for the oldest non-busy buffer and reassign it.
560 struct buf *
561 getblk(dev_t dev, daddr_t blkno, long bsize)
563 return (getblk_common(/* ufsvfsp */ NULL, dev,
564 blkno, bsize, /* errflg */ 0));
568 * Assign a buffer for the given block. If the appropriate
569 * block is already associated, return it; otherwise search
570 * for the oldest non-busy buffer and reassign it.
572 struct buf *
573 getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
575 ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
576 struct buf *bp;
577 struct buf *dp;
578 struct buf *nbp = NULL;
579 struct buf *errbp;
580 uint_t index;
581 kmutex_t *hmp;
582 struct hbuf *hp;
584 if (getmajor(dev) >= devcnt)
585 cmn_err(CE_PANIC, "blkdev");
587 biostats.bio_lookup.value.ui32++;
589 index = bio_bhash(dev, blkno);
590 hp = &hbuf[index];
591 dp = (struct buf *)hp;
592 hmp = &hp->b_lock;
594 mutex_enter(hmp);
595 loop:
596 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
597 if (bp->b_blkno != blkno || bp->b_edev != dev ||
598 (bp->b_flags & B_STALE))
599 continue;
601 * Avoid holding the hash lock in the event that
602 * the buffer is locked by someone. Since the hash chain
603 * may change when we drop the hash lock
604 * we have to start at the beginning of the chain if the
605 * buffer identity/contents aren't valid.
607 if (!sema_tryp(&bp->b_sem)) {
608 biostats.bio_bufbusy.value.ui32++;
609 mutex_exit(hmp);
611 * OK, we are dealing with a busy buffer.
612 * In the case that we are panicking and we
613 * got called from bread(), we have some chance
614 * for error recovery. So better bail out from
615 * here since sema_p() won't block. If we got
616 * called directly from ufs routines, there is
617 * no way to report an error yet.
619 if (panicstr && errflg)
620 goto errout;
622 * For the following line of code to work
623 * correctly never kmem_free the buffer "header".
625 sema_p(&bp->b_sem);
626 if (bp->b_blkno != blkno || bp->b_edev != dev ||
627 (bp->b_flags & B_STALE)) {
628 sema_v(&bp->b_sem);
629 mutex_enter(hmp);
630 goto loop; /* start over */
632 mutex_enter(hmp);
634 /* Found */
635 biostats.bio_hit.value.ui32++;
636 bp->b_flags &= ~B_AGE;
639 * Yank it off the free/delayed write lists
641 hp->b_length--;
642 notavail(bp);
643 mutex_exit(hmp);
645 ASSERT((bp->b_flags & B_NOCACHE) == 0);
647 if (nbp == NULL) {
649 * Make the common path short.
651 ASSERT(SEMA_HELD(&bp->b_sem));
652 return (bp);
655 biostats.bio_bufdup.value.ui32++;
658 * The buffer must have entered the hash while we dropped the
659 * hash lock, so free the new buffer we allocated and return
660 * the found buffer.
662 kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
663 nbp->b_un.b_addr = NULL;
666 * Account for the memory
668 mutex_enter(&bfree_lock);
669 bfreelist.b_bufsize += nbp->b_bufsize;
670 mutex_exit(&bfree_lock);
673 * Destroy buf identity, and place on avail list
675 nbp->b_dev = (o_dev_t)NODEV;
676 nbp->b_edev = NODEV;
677 nbp->b_flags = 0;
678 nbp->b_file = NULL;
679 nbp->b_offset = -1;
681 sema_v(&nbp->b_sem);
682 bio_bhdr_free(nbp);
684 ASSERT(SEMA_HELD(&bp->b_sem));
685 return (bp);
689 * bio_getfreeblk may block so check the hash chain again.
691 if (nbp == NULL) {
692 mutex_exit(hmp);
693 nbp = bio_getfreeblk(bsize);
694 mutex_enter(hmp);
695 goto loop;
699 * New buffer. Assign nbp and stick it on the hash.
701 nbp->b_flags = B_BUSY;
702 nbp->b_edev = dev;
703 nbp->b_dev = (o_dev_t)cmpdev(dev);
704 nbp->b_blkno = blkno;
705 nbp->b_iodone = NULL;
706 nbp->b_bcount = bsize;
708 * If we are given a ufsvfsp and the vfs_root field is NULL
709 * then this must be I/O for a superblock. A superblock's
710 * buffer is set up in mountfs() and there is no root vnode
711 * at that point.
713 if (ufsvfsp && ufsvfsp->vfs_root) {
714 nbp->b_vp = ufsvfsp->vfs_root;
715 } else {
716 nbp->b_vp = NULL;
719 ASSERT((nbp->b_flags & B_NOCACHE) == 0);
721 binshash(nbp, dp);
722 mutex_exit(hmp);
724 ASSERT(SEMA_HELD(&nbp->b_sem));
726 return (nbp);
730 * Come here in case of an internal error. At this point we couldn't
731 * get a buffer, but we have to return one. Hence we allocate some
732 * kind of error reply buffer on the fly. This buffer is marked as
733 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
734 * - B_ERROR will indicate error to the caller.
735 * - B_DONE will prevent us from reading the buffer from
736 * the device.
737 * - B_NOCACHE will cause this buffer to be freed in
738 * brelse().
741 errout:
742 errbp = geteblk();
743 sema_p(&errbp->b_sem);
744 errbp->b_flags &= ~B_BUSY;
745 errbp->b_flags |= (B_ERROR | B_DONE);
746 return (errbp);
750 * Get an empty block, not assigned to any particular device.
751 * Returns a locked buffer that is not on any hash or free list.
753 struct buf *
754 ngeteblk(long bsize)
756 struct buf *bp;
758 bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
759 bioinit(bp);
760 bp->av_forw = bp->av_back = NULL;
761 bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
762 bp->b_bufsize = bsize;
763 bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
764 bp->b_dev = (o_dev_t)NODEV;
765 bp->b_edev = NODEV;
766 bp->b_lblkno = 0;
767 bp->b_bcount = bsize;
768 bp->b_iodone = NULL;
769 return (bp);
773 * The interface of geteblk() is kept intact to maintain driver compatibility.
774 * Use ngeteblk() to allocate a buffer of a size other than 1 KB.
776 struct buf *
777 geteblk(void)
779 return (ngeteblk((long)1024));
783 * Return a buffer w/o sleeping
785 struct buf *
786 trygetblk(dev_t dev, daddr_t blkno)
788 struct buf *bp;
789 struct buf *dp;
790 struct hbuf *hp;
791 kmutex_t *hmp;
792 uint_t index;
794 index = bio_bhash(dev, blkno);
795 hp = &hbuf[index];
796 hmp = &hp->b_lock;
798 if (!mutex_tryenter(hmp))
799 return (NULL);
801 dp = (struct buf *)hp;
802 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
803 if (bp->b_blkno != blkno || bp->b_edev != dev ||
804 (bp->b_flags & B_STALE))
805 continue;
807 * Get access to a valid buffer without sleeping
809 if (sema_tryp(&bp->b_sem)) {
810 if (bp->b_flags & B_DONE) {
811 hp->b_length--;
812 notavail(bp);
813 mutex_exit(hmp);
814 return (bp);
815 } else {
816 sema_v(&bp->b_sem);
817 break;
820 break;
822 mutex_exit(hmp);
823 return (NULL);
827 * Wait for I/O completion on the buffer; return errors
828 * to the user.
831 iowait(struct buf *bp)
833 ASSERT(SEMA_HELD(&bp->b_sem));
834 return (biowait(bp));
838 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
839 * and wake up anyone waiting for it.
841 void
842 iodone(struct buf *bp)
844 ASSERT(SEMA_HELD(&bp->b_sem));
845 (void) biodone(bp);
849 * Zero the core associated with a buffer.
851 void
852 clrbuf(struct buf *bp)
854 ASSERT(SEMA_HELD(&bp->b_sem));
855 bzero(bp->b_un.b_addr, bp->b_bcount);
856 bp->b_resid = 0;
861 * Make sure all write-behind blocks on dev (or NODEV for all)
862 * are flushed out.
864 void
865 bflush(dev_t dev)
867 struct buf *bp, *dp;
868 struct hbuf *hp;
869 struct buf *delwri_list = EMPTY_LIST;
870 int i, index;
871 kmutex_t *hmp;
873 mutex_enter(&blist_lock);
875 * Wait for any invalidates or flushes ahead of us to finish.
876 * We really could split blist_lock up per device for better
877 * parallelism here.
879 while (bio_doinginval || bio_doingflush) {
880 bio_flinv_cv_wanted = 1;
881 cv_wait(&bio_flushinval_cv, &blist_lock);
883 bio_doingflush++;
885 * Gather all B_DELWRI buffers for the device.
886 * Lock ordering is b_sem > hash lock (brelse).
887 * Since we are finding the buffers via the delayed write list,
888 * they may be busy and we would block trying to get the
889 * b_sem lock while holding the hash lock. So transfer all the
890 * candidates onto delwri_list and then drop the hash locks.
892 for (i = 0; i < v.v_hbuf; i++) {
893 hmp = &hbuf[i].b_lock;
894 dp = (struct buf *)&dwbuf[i];
895 mutex_enter(hmp);
896 for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
897 if (dev == NODEV || bp->b_edev == dev) {
898 if (bp->b_list == NULL) {
899 bp->b_list = delwri_list;
900 delwri_list = bp;
904 mutex_exit(hmp);
906 mutex_exit(&blist_lock);
909 * Now that the hash locks have been dropped grab the semaphores
910 * and write back all the buffers that have B_DELWRI set.
912 while (delwri_list != EMPTY_LIST) {
913 bp = delwri_list;
915 sema_p(&bp->b_sem); /* may block */
916 if ((dev != bp->b_edev && dev != NODEV) ||
917 (panicstr && bp->b_flags & B_BUSY)) {
918 sema_v(&bp->b_sem);
919 delwri_list = bp->b_list;
920 bp->b_list = NULL;
921 continue; /* No longer a candidate */
923 if (bp->b_flags & B_DELWRI) {
924 index = bio_bhash(bp->b_edev, bp->b_blkno);
925 hp = &hbuf[index];
926 hmp = &hp->b_lock;
927 dp = (struct buf *)hp;
929 bp->b_flags |= B_ASYNC;
930 mutex_enter(hmp);
931 hp->b_length--;
932 notavail(bp);
933 mutex_exit(hmp);
934 if (bp->b_vp == NULL) { /* !ufs */
935 BWRITE(bp);
936 } else { /* ufs */
937 UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
939 } else {
940 sema_v(&bp->b_sem);
942 delwri_list = bp->b_list;
943 bp->b_list = NULL;
945 mutex_enter(&blist_lock);
946 bio_doingflush--;
947 if (bio_flinv_cv_wanted) {
948 bio_flinv_cv_wanted = 0;
949 cv_broadcast(&bio_flushinval_cv);
951 mutex_exit(&blist_lock);
955 * Ensure that a specified block is up-to-date on disk.
957 void
958 blkflush(dev_t dev, daddr_t blkno)
960 struct buf *bp, *dp;
961 struct hbuf *hp;
962 struct buf *sbp = NULL;
963 uint_t index;
964 kmutex_t *hmp;
966 index = bio_bhash(dev, blkno);
967 hp = &hbuf[index];
968 dp = (struct buf *)hp;
969 hmp = &hp->b_lock;
972 * Identify the buffer in the cache belonging to
973 * this device and blkno (if any).
975 mutex_enter(hmp);
976 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
977 if (bp->b_blkno != blkno || bp->b_edev != dev ||
978 (bp->b_flags & B_STALE))
979 continue;
980 sbp = bp;
981 break;
983 mutex_exit(hmp);
984 if (sbp == NULL)
985 return;
987 * Now check the buffer we have identified and
988 * make sure it still belongs to the device and is B_DELWRI
990 sema_p(&sbp->b_sem);
991 if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
992 (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
993 mutex_enter(hmp);
994 hp->b_length--;
995 notavail(sbp);
996 mutex_exit(hmp);
998 * XXX - There is nothing to guarantee a synchronous
999 * write here if the B_ASYNC flag is set. This needs
1000 * some investigation.
1002 if (sbp->b_vp == NULL) { /* !ufs */
1003 BWRITE(sbp); /* synchronous write */
1004 } else { /* ufs */
1005 UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
1007 } else {
1008 sema_v(&sbp->b_sem);
1013 * Same as binval, except it can force-invalidate delayed-write buffers
1014 * (which may not have been flushed yet because of device errors). Also
1015 * makes sure that the retry write flag is cleared.
1018 bfinval(dev_t dev, int force)
1020 struct buf *dp;
1021 struct buf *bp;
1022 struct buf *binval_list = EMPTY_LIST;
1023 int i, error = 0;
1024 kmutex_t *hmp;
1025 uint_t index;
1026 struct buf **backp;
1028 mutex_enter(&blist_lock);
1030 * Wait for any flushes ahead of us to finish; it's OK to
1031 * do invalidates in parallel.
1033 while (bio_doingflush) {
1034 bio_flinv_cv_wanted = 1;
1035 cv_wait(&bio_flushinval_cv, &blist_lock);
1037 bio_doinginval++;
1039 /* Gather bp's */
1040 for (i = 0; i < v.v_hbuf; i++) {
1041 dp = (struct buf *)&hbuf[i];
1042 hmp = &hbuf[i].b_lock;
1044 mutex_enter(hmp);
1045 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1046 if (bp->b_edev == dev) {
1047 if (bp->b_list == NULL) {
1048 bp->b_list = binval_list;
1049 binval_list = bp;
1053 mutex_exit(hmp);
1055 mutex_exit(&blist_lock);
1057 /* Invalidate all bp's found */
1058 while (binval_list != EMPTY_LIST) {
1059 bp = binval_list;
1061 sema_p(&bp->b_sem);
1062 if (bp->b_edev == dev) {
1063 if (force && (bp->b_flags & B_DELWRI)) {
1064 /* clear B_DELWRI, move to non-dw freelist */
1065 index = bio_bhash(bp->b_edev, bp->b_blkno);
1066 hmp = &hbuf[index].b_lock;
1067 dp = (struct buf *)&hbuf[index];
1068 mutex_enter(hmp);
1070 /* remove from delayed write freelist */
1071 notavail(bp);
1073 /* add to B_AGE side of non-dw freelist */
1074 backp = &dp->av_forw;
1075 (*backp)->av_back = bp;
1076 bp->av_forw = *backp;
1077 *backp = bp;
1078 bp->av_back = dp;
1081 * make sure write retries and busy are cleared
1083 bp->b_flags &=
1084 ~(B_BUSY | B_DELWRI | B_RETRYWRI);
1085 mutex_exit(hmp);
1087 if ((bp->b_flags & B_DELWRI) == 0)
1088 bp->b_flags |= B_STALE|B_AGE;
1089 else
1090 error = EIO;
1092 sema_v(&bp->b_sem);
1093 binval_list = bp->b_list;
1094 bp->b_list = NULL;
1096 mutex_enter(&blist_lock);
1097 bio_doinginval--;
1098 if (bio_flinv_cv_wanted) {
1099 cv_broadcast(&bio_flushinval_cv);
1100 bio_flinv_cv_wanted = 0;
1102 mutex_exit(&blist_lock);
1103 return (error);
1107 * If possible, invalidate blocks for a dev on demand
1109 void
1110 binval(dev_t dev)
1112 (void) bfinval(dev, 0);
1116 * Initialize the buffer I/O system by freeing
1117 * all buffers and setting all device hash buffer lists to empty.
1119 void
1120 binit(void)
1122 struct buf *bp;
1123 unsigned int i, pct;
1124 ulong_t bio_max_hwm, bio_default_hwm;
1127 * Maximum/Default values for bufhwm are set to the smallest of:
1128 * - BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
1129 * - 1/4 of kernel virtual memory
1130 * - INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
1131 * Additionally, in order to allow simple tuning by percentage of
1132 * physical memory, bufhwm_pct is used to calculate the default if
1133 * the value of this tunable is between 1 and 100 / BIO_MAX_PERCENT (i.e. 20).
1135 * Since the unit for v.v_bufhwm is kilobytes, this allows for
1136 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
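 *
 * As a worked example: with 4 KB pages and 4 GB of physical memory
 * (physmem = 1048576 pages), and assuming the kernel heap term is not
 * the limiting factor, the default hwm is (1048576 / 50) * 4 ~= 83884 KB
 * (about 2% of memory) and the maximum is (1048576 / 5) * 4 ~= 838860 KB
 * (about 20% of memory), both expressed in kilobytes as v.v_bufhwm
 * requires.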
1138 bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
1139 btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1140 bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);
1142 pct = BIO_BUF_PERCENT;
1143 if (bufhwm_pct != 0 &&
1144 ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
1145 pct = BIO_BUF_PERCENT;
1147 * Invalid user specified value, emit a warning.
1149 cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
1150 range(1..%d). Using %d as default.",
1151 bufhwm_pct,
1152 100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
1155 bio_default_hwm = MIN(physmem / pct,
1156 btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1157 bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);
1159 if ((v.v_bufhwm = bufhwm) == 0)
1160 v.v_bufhwm = bio_default_hwm;
1162 if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
1163 v.v_bufhwm = (int)bio_max_hwm;
1165 * Invalid user specified value, emit a warning.
1167 cmn_err(CE_WARN,
1168 "binit: bufhwm(%d) out \
1169 of range(%d..%lu). Using %lu as default",
1170 bufhwm,
1171 BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
1175 * Determine the number of hash buckets. Default is to
1176 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
1177 * Round up number to the next power of 2.
1179 v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
1180 BIO_HASHLEN);
1181 v.v_hmask = v.v_hbuf - 1;
1182 v.v_buf = BIO_BHDR_POOL;
1184 hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);
1186 dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);
1188 bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
1189 bp = &bfreelist;
1190 bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;
1192 for (i = 0; i < v.v_hbuf; i++) {
1193 hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
1194 hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];
1197 * Initialize the delayed write buffer list.
1199 dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
1200 dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
1205 * Wait for I/O completion on the buffer; return error code.
1206 * If bp was for synchronous I/O, bp is invalid and associated
1207 * resources are freed on return.
1210 biowait(struct buf *bp)
1212 int error = 0;
1213 struct cpu *cpup;
1215 ASSERT(SEMA_HELD(&bp->b_sem));
1217 cpup = CPU;
1218 atomic_inc_64(&cpup->cpu_stats.sys.iowait);
1219 DTRACE_IO1(wait__start, struct buf *, bp);
1222 * In case of panic, busy wait for completion
1224 if (panicstr) {
1225 while ((bp->b_flags & B_DONE) == 0)
1226 drv_usecwait(10);
1227 } else
1228 sema_p(&bp->b_io);
1230 DTRACE_IO1(wait__done, struct buf *, bp);
1231 atomic_dec_64(&cpup->cpu_stats.sys.iowait);
1233 error = geterror(bp);
1234 if ((bp->b_flags & B_ASYNC) == 0) {
1235 if (bp->b_flags & B_REMAPPED)
1236 bp_mapout(bp);
1238 return (error);
1241 static void
1242 biodone_tnf_probe(struct buf *bp)
1244 /* Kernel probe */
1245 TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
1246 tnf_device, device, bp->b_edev,
1247 tnf_diskaddr, block, bp->b_lblkno,
1248 tnf_opaque, buf, bp);
1252 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
1253 * and wake up anyone waiting for it.
1255 void
1256 biodone(struct buf *bp)
1258 if (bp->b_flags & B_STARTED) {
1259 DTRACE_IO1(done, struct buf *, bp);
1260 bp->b_flags &= ~B_STARTED;
1264 * Call the TNF probe here instead of the inline code
1265 * to force our compiler to use the tail call optimization.
1267 biodone_tnf_probe(bp);
1269 if (bp->b_iodone != NULL) {
1270 (*(bp->b_iodone))(bp);
1271 return;
1273 ASSERT((bp->b_flags & B_DONE) == 0);
1274 ASSERT(SEMA_HELD(&bp->b_sem));
1275 bp->b_flags |= B_DONE;
1276 if (bp->b_flags & B_ASYNC) {
1277 if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
1278 bio_pageio_done(bp);
1279 else
1280 brelse(bp); /* release bp to freelist */
1281 } else {
1282 sema_v(&bp->b_io);
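/*
 * A minimal sketch of the usual driver-side use of bioerror() and
 * biodone() when a transfer completes (the example_softc fields and the
 * interrupt handler shown are purely illustrative):
 *
 *	static uint_t
 *	example_intr(caddr_t arg)
 *	{
 *		struct example_softc *sc = (struct example_softc *)arg;
 *		struct buf *bp = sc->sc_curbp;
 *
 *		if (sc->sc_hw_error)
 *			bioerror(bp, EIO);
 *		bp->b_resid = sc->sc_resid;
 *		biodone(bp);
 *		return (DDI_INTR_CLAIMED);
 *	}
 */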
1287 * Pick up the device's error number and pass it to the user;
1288 * if there is an error but the number is 0 set a generalized code.
1291 geterror(struct buf *bp)
1293 int error = 0;
1295 ASSERT(SEMA_HELD(&bp->b_sem));
1296 if (bp->b_flags & B_ERROR) {
1297 error = bp->b_error;
1298 if (!error)
1299 error = EIO;
1301 return (error);
1305 * Support for pageio buffers.
1307 * This stuff should be generalized to provide a general bp
1308 * header facility that can be used for things other than pageio.
1312 * Allocate and initialize a buf struct for use with pageio.
1314 struct buf *
1315 pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
1317 struct buf *bp;
1318 struct cpu *cpup;
1320 if (flags & B_READ) {
1321 CPU_STATS_ENTER_K();
1322 cpup = CPU; /* get pointer AFTER preemption is disabled */
1323 CPU_STATS_ADDQ(cpup, vm, pgin, 1);
1324 CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
1326 atomic_add_64(&curzone->zone_pgpgin, btopr(len));
1328 if ((flags & B_ASYNC) == 0) {
1329 klwp_t *lwp = ttolwp(curthread);
1330 if (lwp != NULL)
1331 lwp->lwp_ru.majflt++;
1332 CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
1333 /* Kernel probe */
1334 TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
1335 tnf_opaque, vnode, pp->p_vnode,
1336 tnf_offset, offset, pp->p_offset);
1339 * Update statistics for pages being paged in
1341 if (pp != NULL && pp->p_vnode != NULL) {
1342 if (IS_SWAPFSVP(pp->p_vnode)) {
1343 CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
1344 atomic_add_64(&curzone->zone_anonpgin,
1345 btopr(len));
1346 } else {
1347 if (pp->p_vnode->v_flag & VVMEXEC) {
1348 CPU_STATS_ADDQ(cpup, vm, execpgin,
1349 btopr(len));
1350 atomic_add_64(&curzone->zone_execpgin,
1351 btopr(len));
1352 } else {
1353 CPU_STATS_ADDQ(cpup, vm, fspgin,
1354 btopr(len));
1355 atomic_add_64(&curzone->zone_fspgin,
1356 btopr(len));
1360 CPU_STATS_EXIT_K();
1361 TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
1362 "page_ws_in:pp %p", pp);
1363 /* Kernel probe */
1364 TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
1365 tnf_opaque, vnode, pp->p_vnode,
1366 tnf_offset, offset, pp->p_offset,
1367 tnf_size, size, len);
1370 bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
1371 bp->b_bcount = len;
1372 bp->b_bufsize = len;
1373 bp->b_pages = pp;
1374 bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
1375 bp->b_offset = -1;
1376 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
1378 /* Initialize bp->b_sem in "locked" state */
1379 sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
1381 VN_HOLD(vp);
1382 bp->b_vp = vp;
1383 THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */
1386 * Caller sets dev & blkno and can adjust
1387 * b_addr for page offset and can use bp_mapin
1388 * to make pages kernel addressable.
1390 return (bp);
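/*
 * A minimal synchronous usage sketch of the pageio interfaces ('pp',
 * 'vp', 'dev', 'blkno' and 'len' are caller supplied; error handling
 * trimmed):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	bp = pageio_setup(pp, len, vp, B_READ);
 *	bp->b_edev = dev;
 *	bp->b_blkno = blkno;
 *	bp->b_un.b_addr = (caddr_t)0;	 (offset of the I/O within the page)
 *	(void) bdev_strategy(bp);
 *	error = biowait(bp);
 *	pageio_done(bp);
 */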
1393 void
1394 pageio_done(struct buf *bp)
1396 ASSERT(SEMA_HELD(&bp->b_sem));
1397 if (bp->b_flags & B_REMAPPED)
1398 bp_mapout(bp);
1399 VN_RELE(bp->b_vp);
1400 bp->b_vp = NULL;
1401 ASSERT((bp->b_flags & B_NOCACHE) != 0);
1403 /* A sema_v(bp->b_sem) is implied if we are destroying it */
1404 sema_destroy(&bp->b_sem);
1405 sema_destroy(&bp->b_io);
1406 kmem_free(bp, sizeof (struct buf));
1410 * Check whether any buffers associated with the device, other than
1411 * the one pointed to by sbp, are busy.
1412 * NOTE: This expensive operation should be improved together with ufs_icheck().
1415 bcheck(dev_t dev, struct buf *sbp)
1417 struct buf *bp;
1418 struct buf *dp;
1419 int i;
1420 kmutex_t *hmp;
1423 * check for busy bufs for this filesystem
1425 for (i = 0; i < v.v_hbuf; i++) {
1426 dp = (struct buf *)&hbuf[i];
1427 hmp = &hbuf[i].b_lock;
1429 mutex_enter(hmp);
1430 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1432 * if buf is busy or dirty, then filesystem is busy
1434 if ((bp->b_edev == dev) &&
1435 ((bp->b_flags & B_STALE) == 0) &&
1436 (bp->b_flags & (B_DELWRI|B_BUSY)) &&
1437 (bp != sbp)) {
1438 mutex_exit(hmp);
1439 return (1);
1442 mutex_exit(hmp);
1444 return (0);
1448 * Hash two 32 bit entities.
1451 hash2ints(int x, int y)
1453 int hash = 0;
1455 hash = x - 1;
1456 hash = ((hash * 7) + (x >> 8)) - 1;
1457 hash = ((hash * 7) + (x >> 16)) - 1;
1458 hash = ((hash * 7) + (x >> 24)) - 1;
1459 hash = ((hash * 7) + y) - 1;
1460 hash = ((hash * 7) + (y >> 8)) - 1;
1461 hash = ((hash * 7) + (y >> 16)) - 1;
1462 hash = ((hash * 7) + (y >> 24)) - 1;
1464 return (hash);
1469 * Return a new buffer struct.
1470 * Create a new buffer if we haven't gone over our high water
1471 * mark for memory, otherwise try to get one off the freelist.
1473 * Returns a locked buf that has no id and is not on any hash or free
1474 * list.
1476 static struct buf *
1477 bio_getfreeblk(long bsize)
1479 struct buf *bp, *dp;
1480 struct hbuf *hp;
1481 kmutex_t *hmp;
1482 uint_t start, end;
1485 * bfreelist.b_bufsize (all references to it are protected by
1486 * bfree_lock) represents the amount of memory we are still
1487 * allowed to allocate in the cache before we hit our
1488 * high water mark.
1490 bio_mem_get(bsize); /* Account for our memory request */
1492 again:
1493 bp = bio_bhdr_alloc(); /* Get a buf hdr */
1494 sema_p(&bp->b_sem); /* Should never fail */
1496 ASSERT(bp->b_un.b_addr == NULL);
1497 bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
1498 if (bp->b_un.b_addr != NULL) {
1500 * Make the common path short
1502 bp->b_bufsize = bsize;
1503 ASSERT(SEMA_HELD(&bp->b_sem));
1504 return (bp);
1505 } else {
1506 struct buf *save;
1508 save = bp; /* Save bp we allocated */
1509 start = end = lastindex;
1511 biostats.bio_bufwant.value.ui32++;
1514 * Memory isn't available from the system now. Scan
1515 * the hash buckets till enough space is found.
1517 do {
1518 hp = &hbuf[start];
1519 hmp = &hp->b_lock;
1520 dp = (struct buf *)hp;
1522 mutex_enter(hmp);
1523 bp = dp->av_forw;
1525 while (bp != dp) {
1527 ASSERT(bp != NULL);
1529 if (!sema_tryp(&bp->b_sem)) {
1530 bp = bp->av_forw;
1531 continue;
1535 * Since we are going down the freelist
1536 * associated with this hash bucket the
1537 * B_DELWRI flag should not be set.
1539 ASSERT(!(bp->b_flags & B_DELWRI));
1541 if (bp->b_bufsize == bsize) {
1542 hp->b_length--;
1543 notavail(bp);
1544 bremhash(bp);
1545 mutex_exit(hmp);
1548 * Didn't kmem_alloc any more, so don't
1549 * count it twice.
1551 mutex_enter(&bfree_lock);
1552 bfreelist.b_bufsize += bsize;
1553 mutex_exit(&bfree_lock);
1556 * Update the lastindex value.
1558 lastindex = start;
1561 * Put our saved bp back on the list
1563 sema_v(&save->b_sem);
1564 bio_bhdr_free(save);
1565 ASSERT(SEMA_HELD(&bp->b_sem));
1566 return (bp);
1568 sema_v(&bp->b_sem);
1569 bp = bp->av_forw;
1571 mutex_exit(hmp);
1572 start = ((start + 1) % v.v_hbuf);
1573 } while (start != end);
1575 biostats.bio_bufwait.value.ui32++;
1576 bp = save; /* Use original bp */
1577 bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
1580 bp->b_bufsize = bsize;
1581 ASSERT(SEMA_HELD(&bp->b_sem));
1582 return (bp);
1586 * Allocate a buffer header. If none currently available, allocate
1587 * a new pool.
1589 static struct buf *
1590 bio_bhdr_alloc(void)
1592 struct buf *dp, *sdp;
1593 struct buf *bp;
1594 int i;
1596 for (;;) {
1597 mutex_enter(&bhdr_lock);
1598 if (bhdrlist != NULL) {
1599 bp = bhdrlist;
1600 bhdrlist = bp->av_forw;
1601 mutex_exit(&bhdr_lock);
1602 bp->av_forw = NULL;
1603 return (bp);
1605 mutex_exit(&bhdr_lock);
1608 * Need to allocate a new pool. If the system is currently
1609 * out of memory, then try freeing things on the freelist.
1611 dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
1612 if (dp == NULL) {
1614 * System can't give us a pool of headers, try
1615 * recycling from the free lists.
1617 bio_recycle(BIO_HEADER, 0);
1618 } else {
1619 sdp = dp;
1620 for (i = 0; i < v.v_buf; i++, dp++) {
1622 * The next two lines are needed since NODEV
1623 * is -1 and not NULL
1625 dp->b_dev = (o_dev_t)NODEV;
1626 dp->b_edev = NODEV;
1627 dp->av_forw = dp + 1;
1628 sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
1629 NULL);
1630 sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
1631 NULL);
1632 dp->b_offset = -1;
1634 mutex_enter(&bhdr_lock);
1635 (--dp)->av_forw = bhdrlist; /* Fix last pointer */
1636 bhdrlist = sdp;
1637 nbuf += v.v_buf;
1638 bp = bhdrlist;
1639 bhdrlist = bp->av_forw;
1640 mutex_exit(&bhdr_lock);
1642 bp->av_forw = NULL;
1643 return (bp);
1648 static void
1649 bio_bhdr_free(struct buf *bp)
1651 ASSERT(bp->b_back == NULL);
1652 ASSERT(bp->b_forw == NULL);
1653 ASSERT(bp->av_back == NULL);
1654 ASSERT(bp->av_forw == NULL);
1655 ASSERT(bp->b_un.b_addr == NULL);
1656 ASSERT(bp->b_dev == (o_dev_t)NODEV);
1657 ASSERT(bp->b_edev == NODEV);
1658 ASSERT(bp->b_flags == 0);
1660 mutex_enter(&bhdr_lock);
1661 bp->av_forw = bhdrlist;
1662 bhdrlist = bp;
1663 mutex_exit(&bhdr_lock);
1667 * If we haven't gone over the high water mark, it's o.k. to
1668 * allocate more buffer space, otherwise recycle buffers
1669 * from the freelist until enough memory is free for a bsize request.
1671 * We account for this memory, even though
1672 * we don't allocate it here.
1674 static void
1675 bio_mem_get(long bsize)
1677 mutex_enter(&bfree_lock);
1678 if (bfreelist.b_bufsize > bsize) {
1679 bfreelist.b_bufsize -= bsize;
1680 mutex_exit(&bfree_lock);
1681 return;
1683 mutex_exit(&bfree_lock);
1684 bio_recycle(BIO_MEM, bsize);
1688 * flush a list of delayed write buffers.
1689 * (currently used only by bio_recycle below.)
1691 static void
1692 bio_flushlist(struct buf *delwri_list)
1694 struct buf *bp;
1696 while (delwri_list != EMPTY_LIST) {
1697 bp = delwri_list;
1698 bp->b_flags |= B_AGE | B_ASYNC;
1699 if (bp->b_vp == NULL) { /* !ufs */
1700 BWRITE(bp);
1701 } else { /* ufs */
1702 UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
1704 delwri_list = bp->b_list;
1705 bp->b_list = NULL;
1710 * Start recycling buffers on the freelist for one of 2 reasons:
1711 * - we need a buffer header
1712 * - we need to free up memory
1713 * Once started we continue to recycle buffers until the B_AGE
1714 * buffers are gone.
1716 static void
1717 bio_recycle(int want, long bsize)
1719 struct buf *bp, *dp, *dwp, *nbp;
1720 struct hbuf *hp;
1721 int found = 0;
1722 kmutex_t *hmp;
1723 int start, end;
1724 struct buf *delwri_list = EMPTY_LIST;
1727 * Recycle buffers.
1729 top:
1730 start = end = lastindex;
1731 do {
1732 hp = &hbuf[start];
1733 hmp = &hp->b_lock;
1734 dp = (struct buf *)hp;
1736 mutex_enter(hmp);
1737 bp = dp->av_forw;
1739 while (bp != dp) {
1741 ASSERT(bp != NULL);
1743 if (!sema_tryp(&bp->b_sem)) {
1744 bp = bp->av_forw;
1745 continue;
1748 * Do we really want to nuke all of the B_AGE stuff??
1750 if ((bp->b_flags & B_AGE) == 0 && found) {
1751 sema_v(&bp->b_sem);
1752 mutex_exit(hmp);
1753 lastindex = start;
1754 return; /* All done */
1757 ASSERT(MUTEX_HELD(&hp->b_lock));
1758 ASSERT(!(bp->b_flags & B_DELWRI));
1759 hp->b_length--;
1760 notavail(bp);
1763 * Remove bhdr from cache, free up memory,
1764 * and add the hdr to the freelist.
1766 bremhash(bp);
1767 mutex_exit(hmp);
1769 if (bp->b_bufsize) {
1770 kmem_free(bp->b_un.b_addr, bp->b_bufsize);
1771 bp->b_un.b_addr = NULL;
1772 mutex_enter(&bfree_lock);
1773 bfreelist.b_bufsize += bp->b_bufsize;
1774 mutex_exit(&bfree_lock);
1777 bp->b_dev = (o_dev_t)NODEV;
1778 bp->b_edev = NODEV;
1779 bp->b_flags = 0;
1780 sema_v(&bp->b_sem);
1781 bio_bhdr_free(bp);
1782 if (want == BIO_HEADER) {
1783 found = 1;
1784 } else {
1785 ASSERT(want == BIO_MEM);
1786 if (!found && bfreelist.b_bufsize >= bsize) {
1787 /* Account for the memory we want */
1788 mutex_enter(&bfree_lock);
1789 if (bfreelist.b_bufsize >= bsize) {
1790 bfreelist.b_bufsize -= bsize;
1791 found = 1;
1793 mutex_exit(&bfree_lock);
1798 * Since we dropped hmp, start from the
1799 * beginning.
1801 mutex_enter(hmp);
1802 bp = dp->av_forw;
1804 mutex_exit(hmp);
1807 * Look at the delayed write list.
1808 * First gather into a private list, then write them.
1810 dwp = (struct buf *)&dwbuf[start];
1811 mutex_enter(&blist_lock);
1812 bio_doingflush++;
1813 mutex_enter(hmp);
1814 for (bp = dwp->av_forw; bp != dwp; bp = nbp) {
1816 ASSERT(bp != NULL);
1817 nbp = bp->av_forw;
1819 if (!sema_tryp(&bp->b_sem))
1820 continue;
1821 ASSERT(bp->b_flags & B_DELWRI);
1823 * Do we really want to nuke all of the B_AGE stuff??
1826 if ((bp->b_flags & B_AGE) == 0 && found) {
1827 sema_v(&bp->b_sem);
1828 mutex_exit(hmp);
1829 lastindex = start;
1830 mutex_exit(&blist_lock);
1831 bio_flushlist(delwri_list);
1832 mutex_enter(&blist_lock);
1833 bio_doingflush--;
1834 if (bio_flinv_cv_wanted) {
1835 bio_flinv_cv_wanted = 0;
1836 cv_broadcast(&bio_flushinval_cv);
1838 mutex_exit(&blist_lock);
1839 return; /* All done */
1843 * If the buffer is already on a flush or
1844 * invalidate list then just skip it.
1846 if (bp->b_list != NULL) {
1847 sema_v(&bp->b_sem);
1848 continue;
1851 * We are still on the same bucket.
1853 hp->b_length--;
1854 notavail(bp);
1855 bp->b_list = delwri_list;
1856 delwri_list = bp;
1858 mutex_exit(hmp);
1859 mutex_exit(&blist_lock);
1860 bio_flushlist(delwri_list);
1861 delwri_list = EMPTY_LIST;
1862 mutex_enter(&blist_lock);
1863 bio_doingflush--;
1864 if (bio_flinv_cv_wanted) {
1865 bio_flinv_cv_wanted = 0;
1866 cv_broadcast(&bio_flushinval_cv);
1868 mutex_exit(&blist_lock);
1869 start = (start + 1) % v.v_hbuf;
1871 } while (start != end);
1873 if (found)
1874 return;
1877 * Free lists exhausted and we haven't satisfied the request.
1878 * Wait here for more entries to be added to freelist.
1879 * Because this might have just happened, make it timed.
1881 mutex_enter(&bfree_lock);
1882 bfreelist.b_flags |= B_WANTED;
1883 (void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
1884 mutex_exit(&bfree_lock);
1885 goto top;
1889 * See if the block is associated with some buffer
1890 * (mainly to avoid getting hung up on a wait in breada).
1892 static int
1893 bio_incore(dev_t dev, daddr_t blkno)
1895 struct buf *bp;
1896 struct buf *dp;
1897 uint_t index;
1898 kmutex_t *hmp;
1900 index = bio_bhash(dev, blkno);
1901 dp = (struct buf *)&hbuf[index];
1902 hmp = &hbuf[index].b_lock;
1904 mutex_enter(hmp);
1905 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1906 if (bp->b_blkno == blkno && bp->b_edev == dev &&
1907 (bp->b_flags & B_STALE) == 0) {
1908 mutex_exit(hmp);
1909 return (1);
1912 mutex_exit(hmp);
1913 return (0);
1916 static void
1917 bio_pageio_done(struct buf *bp)
1919 if (bp->b_flags & B_PAGEIO) {
1921 if (bp->b_flags & B_REMAPPED)
1922 bp_mapout(bp);
1924 if (bp->b_flags & B_READ)
1925 pvn_read_done(bp->b_pages, bp->b_flags);
1926 else
1927 pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
1928 pageio_done(bp);
1929 } else {
1930 ASSERT(bp->b_flags & B_REMAPPED);
1931 bp_mapout(bp);
1932 brelse(bp);
1937 * bioerror(9F) - indicate error in buffer header
1938 * If 'error' is zero, remove the error indication.
1940 void
1941 bioerror(struct buf *bp, int error)
1943 ASSERT(bp != NULL);
1944 ASSERT(error >= 0);
1945 ASSERT(SEMA_HELD(&bp->b_sem));
1947 if (error != 0) {
1948 bp->b_flags |= B_ERROR;
1949 } else {
1950 bp->b_flags &= ~B_ERROR;
1952 bp->b_error = error;
1956 * bioreset(9F) - reuse a private buffer header after I/O is complete
1958 void
1959 bioreset(struct buf *bp)
1961 ASSERT(bp != NULL);
1963 biofini(bp);
1964 bioinit(bp);
1968 * biosize(9F) - return size of a buffer header
1970 size_t
1971 biosize(void)
1973 return (sizeof (struct buf));
1977 * biomodified(9F) - check if buffer is modified
1980 biomodified(struct buf *bp)
1982 int npf;
1983 int ppattr;
1984 struct page *pp;
1986 ASSERT(bp != NULL);
1988 if ((bp->b_flags & B_PAGEIO) == 0) {
1989 return (-1);
1991 pp = bp->b_pages;
1992 npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));
1994 while (npf > 0) {
1995 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
1996 HAT_SYNC_STOPON_MOD);
1997 if (ppattr & P_MOD)
1998 return (1);
1999 pp = pp->p_next;
2000 npf--;
2003 return (0);
2007 * bioinit(9F) - initialize a buffer structure
2009 void
2010 bioinit(struct buf *bp)
2012 bzero(bp, sizeof (struct buf));
2013 sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
2014 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
2015 bp->b_offset = -1;
2019 * biofini(9F) - uninitialize a buffer structure
2021 void
2022 biofini(struct buf *bp)
2024 sema_destroy(&bp->b_io);
2025 sema_destroy(&bp->b_sem);
2029 * bioclone(9F) - clone a buffer
2031 struct buf *
2032 bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
2033 int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
2035 struct buf *bufp;
2037 ASSERT(bp);
2038 if (bp_mem == NULL) {
2039 bufp = kmem_alloc(sizeof (struct buf), sleep);
2040 if (bufp == NULL) {
2041 return (NULL);
2043 bioinit(bufp);
2044 } else {
2045 bufp = bp_mem;
2046 bioreset(bufp);
2049 #define BUF_CLONE_FLAGS (B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
2050 B_ABRWRITE)
2053 * The cloned buffer does not inherit the B_REMAPPED flag.
2055 bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS) | B_BUSY;
2056 bufp->b_bcount = len;
2057 bufp->b_blkno = blkno;
2058 bufp->b_iodone = iodone;
2059 bufp->b_proc = bp->b_proc;
2060 bufp->b_edev = dev;
2061 bufp->b_file = bp->b_file;
2062 bufp->b_offset = bp->b_offset;
2064 if (bp->b_flags & B_SHADOW) {
2065 ASSERT(bp->b_shadow);
2066 ASSERT(bp->b_flags & B_PHYS);
2068 bufp->b_shadow = bp->b_shadow +
2069 btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
2070 bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2071 if (bp->b_flags & B_REMAPPED)
2072 bufp->b_proc = NULL;
2073 } else {
2074 if (bp->b_flags & B_PAGEIO) {
2075 struct page *pp;
2076 off_t o;
2077 int i;
2079 pp = bp->b_pages;
2080 o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
2081 for (i = btop(o); i > 0; i--) {
2082 pp = pp->p_next;
2084 bufp->b_pages = pp;
2085 bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
2086 } else {
2087 bufp->b_un.b_addr =
2088 (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2089 if (bp->b_flags & B_REMAPPED)
2090 bufp->b_proc = NULL;
2093 return (bufp);
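/*
 * A minimal sketch of cloning part of a parent transfer into a child buf
 * and issuing it synchronously ('pbp' is the parent buffer, 'len' the
 * length of the slice starting at byte offset 'off', and 'child_blkno'
 * where that slice lives on 'dev'; all are caller supplied):
 *
 *	struct buf *cbp;
 *	int error;
 *
 *	cbp = bioclone(pbp, off, len, dev, child_blkno,
 *	    NULL, NULL, KM_SLEEP);
 *	(void) bdev_strategy(cbp);
 *	error = biowait(cbp);
 *	freerbuf(cbp);		 (frees the header bioclone allocated above)
 */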