/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/sysmacros.h>
#include <sys/cpuvar.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/vnode.h>
#include <sys/bitmap.h>
#include <sys/cmn_err.h>
#include <sys/atomic.h>
#include <vm/seg_kmem.h>
#include <sys/vtrace.h>
#include <sys/tnf_probe.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/systm.h>
static kmutex_t	blist_lock;	/* protects b_list */
static kmutex_t	bhdr_lock;	/* protects the bhdrlist */
static kmutex_t	bfree_lock;	/* protects the bfreelist structure */

struct hbuf	*hbuf;			/* Hash buckets */
struct dwbuf	*dwbuf;			/* Delayed write buckets */
static struct buf *bhdrlist;		/* buf header free list */
static int	nbuf;			/* number of buffer headers allocated */

static int	lastindex;		/* Reference point on where to start */
					/* when looking for free buffers */

#define	bio_bhash(dev, bn)	(hash2ints((dev), (int)(bn)) & v.v_hmask)
#define	EMPTY_LIST	((struct buf *)-1)
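/*
 * Illustrative sketch (not part of the original source): bio_bhash() folds
 * a (dev, blkno) pair through hash2ints() and masks the result with
 * v.v_hmask, so it is always a valid index into the hbuf[]/dwbuf[] arrays.
 * The helper below exists only to show that usage.
 */
#ifdef	BIO_EXAMPLES
static struct hbuf *
bio_example_bucket(dev_t dev, daddr_t blkno)
{
	int index = bio_bhash(dev, blkno);	/* 0 .. v.v_hbuf - 1 */

	return (&hbuf[index]);
}
#endif	/* BIO_EXAMPLES */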
static kcondvar_t	bio_mem_cv;		/* Condition variables */
static kcondvar_t	bio_flushinval_cv;
static int		bio_doingflush;		/* flush in progress */
static int		bio_doinginval;		/* inval in progress */
static int		bio_flinv_cv_wanted;	/* someone waiting for cv */
/*
 * Statistics on the buffer cache
 */
struct biostats biostats = {
	{ "buffer_cache_lookups",		KSTAT_DATA_UINT32 },
	{ "buffer_cache_hits",			KSTAT_DATA_UINT32 },
	{ "new_buffer_requests",		KSTAT_DATA_UINT32 },
	{ "waits_for_buffer_allocs",		KSTAT_DATA_UINT32 },
	{ "buffers_locked_by_someone",		KSTAT_DATA_UINT32 },
	{ "duplicate_buffers_found",		KSTAT_DATA_UINT32 }
};

kstat_named_t	*biostats_ptr = (kstat_named_t *)&biostats;
uint_t		biostats_ndata = (uint_t)(sizeof (biostats) /
    sizeof (kstat_named_t));
/*
 * Statistics on ufs buffer cache
 * Not protected by locks
 */
struct ufsbiostats ub = {
	{ "breads",			KSTAT_DATA_UINT32 },
	{ "bwrites",			KSTAT_DATA_UINT32 },
	{ "fbiwrites",			KSTAT_DATA_UINT32 },
	{ "getpages",			KSTAT_DATA_UINT32 },
	{ "getras",			KSTAT_DATA_UINT32 },
	{ "putsyncs",			KSTAT_DATA_UINT32 },
	{ "putasyncs",			KSTAT_DATA_UINT32 },
	{ "putpageios",			KSTAT_DATA_UINT32 },
};
/*
 * more UFS Logging eccentricities...
 *
 * required since "#pragma weak ..." doesn't work in reverse order.
 * i.e.:  genunix (bio.c) is loaded before the ufs modules and pointers
 *        to ufs routines don't get plugged into bio.c calls so
 *        we initialize it when setting up the "lufsops" table
 *        in "lufs.c:_init()"
 */
void (*bio_lufs_strategy)(void *, buf_t *);
void (*bio_snapshot_strategy)(void *, buf_t *);
/* Private routines */
static struct buf	*bio_getfreeblk(long);
static void		bio_mem_get(long);
static void		bio_bhdr_free(struct buf *);
static struct buf	*bio_bhdr_alloc(void);
static void		bio_recycle(int, long);
static void		bio_pageio_done(struct buf *);
static int		bio_incore(dev_t, daddr_t);
/*
 * Buffer cache constants
 */
#define	BIO_BUF_PERCENT	(100/2)		/* default: 2% of memory */
#define	BIO_MAX_PERCENT	(100/20)	/* max is 20% of real memory */
#define	BIO_BHDR_POOL	100		/* Default bhdr pool size */
#define	BIO_MIN_HDR	10		/* Minimum number of buffer headers */
#define	BIO_MIN_HWM	(BIO_MIN_HDR * MAXBSIZE / 1024)
#define	BIO_HASHLEN	4		/* Target length of hash chains */
/* Flags for bio_recycle() */
#define	BIO_HEADER	0x01
#define	BIO_MEM		0x02

extern	int bufhwm;		/* User tunable - high water mark for mem */
extern	int bufhwm_pct;		/* ditto - given in % of physmem */
/*
 * The following routines allocate and free
 * buffers with various side effects.  In general the
 * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer to
 * the buffer header; the buffer returned is locked with a
 * binary semaphore so that no one else can touch it.  If the block was
 * already in core, no I/O need be done; if it is
 * already locked, the process waits until it becomes free.
 * The following routines allocate a buffer:
 *	getblk
 *	bread/BREAD
 *	breada
 * Eventually the buffer must be released, possibly with the
 * side effect of writing it out, by using one of
 *	bwrite/BWRITE/brwrite
 *	bdwrite/BDWRITE
 *	bawrite
 *	brelse
 *
 * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
 * Instead, a binary semaphore, b_sem, is used to gain exclusive access to
 * a buffer, and a binary semaphore, b_io, is used for I/O synchronization.
 * B_DONE is still used to denote a buffer with I/O complete on it.
 *
 * The bfreelist.b_bcount field is computed every time fsflush runs.  It
 * should not be used where a very accurate count of the free buffers is
 * needed.
 */
/*
 * Read in (if necessary) the block and return a buffer pointer.
 *
 * This interface is provided for binary compatibility.  Using
 * BREAD() directly avoids the extra function call overhead invoked
 * by calling this routine.
 */
struct buf *
bread(dev_t dev, daddr_t blkno, long bsize)
{
	return (BREAD(dev, blkno, bsize));
}
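/*
 * Illustrative sketch (not part of the original source): a typical caller
 * reads a block with bread(), checks for an I/O error, and releases the
 * buffer with brelse().  The device, block number, and size are whatever
 * the caller is working with; the helper name is hypothetical.
 */
#ifdef	BIO_EXAMPLES
static int
bio_example_read_block(dev_t dev, daddr_t blkno, long bsize)
{
	struct buf *bp;
	int error;

	bp = bread(dev, blkno, bsize);	/* returned with b_sem held */
	error = geterror(bp);		/* 0 if the read succeeded */
	brelse(bp);			/* release; buffer stays cached */
	return (error);
}
#endif	/* BIO_EXAMPLES */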
/*
 * Common code for reading a buffer with various options
 *
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
{
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
	struct buf *bp;
	klwp_t *lwp = ttolwp(curthread);

	CPU_STATS_ADD_K(sys, lread, 1);
	bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
	if (bp->b_flags & B_DONE)
		return (bp);
	bp->b_flags |= B_READ;
	ASSERT(bp->b_bcount == bsize);
	if (ufsvfsp == NULL) {			/* !ufs */
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
		/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
		/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
		ub.ub_breads.value.ul++;	/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (lwp != NULL)
		lwp->lwp_ru.inblock++;
	CPU_STATS_ADD_K(sys, bread, 1);
	(void) biowait(bp);
	return (bp);
}
/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
{
	struct buf *bp, *rabp;
	klwp_t *lwp = ttolwp(curthread);

	bp = NULL;
	if (!bio_incore(dev, blkno)) {
		CPU_STATS_ADD_K(sys, lread, 1);
		bp = GETBLK(dev, blkno, bsize);
		if ((bp->b_flags & B_DONE) == 0) {
			bp->b_flags |= B_READ;
			bp->b_bcount = bsize;
			(void) bdev_strategy(bp);
			if (lwp != NULL)
				lwp->lwp_ru.inblock++;
			CPU_STATS_ADD_K(sys, bread, 1);
		}
	}

	if (rablkno && bfreelist.b_bcount > 1 &&
	    !bio_incore(dev, rablkno)) {
		rabp = GETBLK(dev, rablkno, bsize);
		if (rabp->b_flags & B_DONE) {
			brelse(rabp);
		} else {
			rabp->b_flags |= B_READ|B_ASYNC;
			rabp->b_bcount = bsize;
			(void) bdev_strategy(rabp);
			if (lwp != NULL)
				lwp->lwp_ru.inblock++;
			CPU_STATS_ADD_K(sys, bread, 1);
		}
	}

	if (bp == NULL)
		return (BREAD(dev, blkno, bsize));
	(void) biowait(bp);
	return (bp);
}
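/*
 * Illustrative sketch (not part of the original source): sequential readers
 * can use breada() to overlap the read of the next block with processing of
 * the current one.  The choice of blkno + 1 as the read-ahead block is a
 * hypothetical example.
 */
#ifdef	BIO_EXAMPLES
static struct buf *
bio_example_sequential_read(dev_t dev, daddr_t blkno, long bsize)
{
	/* read blkno now and start an async read-ahead of the next block */
	return (breada(dev, blkno, blkno + 1, bsize));
}
#endif	/* BIO_EXAMPLES */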
/*
 * Common code for writing a buffer with various options.
 *
 * force_wait  - wait for write completion regardless of B_ASYNC flag
 * do_relse    - release the buffer when we are done
 * clear_flags - flags to clear from the buffer
 */
void
bwrite_common(void *arg, struct buf *bp, int force_wait,
    int do_relse, int clear_flags)
{
	register int do_wait;
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
	int flag;
	klwp_t *lwp = ttolwp(curthread);
	struct cpu *cpup;

	ASSERT(SEMA_HELD(&bp->b_sem));
	flag = bp->b_flags;
	bp->b_flags &= ~clear_flags;
	if (lwp != NULL)
		lwp->lwp_ru.oublock++;
	CPU_STATS_ENTER_K();
	cpup = CPU;		/* get pointer AFTER preemption is disabled */
	CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
	CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
	do_wait = ((flag & B_ASYNC) == 0 || force_wait);
	if (do_wait == 0)
		CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
	CPU_STATS_EXIT_K();
	if (ufsvfsp == NULL) {
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
		/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
		/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ub.ub_bwrites.value.ul++;	/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (do_wait) {
		(void) biowait(bp);
		if (do_relse)
			brelse(bp);
	}
}
/*
 * Write the buffer, waiting for completion (unless B_ASYNC is set).
 * Then release the buffer.
 * This interface is provided for binary compatibility.  Using
 * BWRITE() directly avoids the extra function call overhead invoked
 * by calling this routine.
 */
void
bwrite(struct buf *bp)
{
	BWRITE(bp);
}

/*
 * Write the buffer, waiting for completion.
 * But don't release the buffer afterwards.
 * This interface is provided for binary compatibility.  Using
 * BWRITE2() directly avoids the extra function call overhead.
 */
void
bwrite2(struct buf *bp)
{
	BWRITE2(bp);
}
/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * Also save the time that the block is first marked as delayed
 * so that it will be written in a reasonable time.
 */
void
bdwrite(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	CPU_STATS_ADD_K(sys, lwrite, 1);
	if ((bp->b_flags & B_DELWRI) == 0)
		bp->b_start = ddi_get_lbolt();
	/*
	 * B_DONE allows others to use the buffer, B_DELWRI causes the
	 * buffer to be written before being reused, and setting b_resid
	 * to zero says the buffer is complete.
	 */
	bp->b_flags |= B_DELWRI | B_DONE;
	bp->b_resid = 0;
	brelse(bp);
}
/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
void
bawrite(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));

	/* Use bfreelist.b_bcount as a weird-ass heuristic */
	if (bfreelist.b_bcount > 4)
		bp->b_flags |= B_ASYNC;
	BWRITE(bp);
}
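/*
 * Illustrative sketch (not part of the original source): the three common
 * ways to release a dirty buffer.  bwrite() writes and waits, bawrite()
 * starts the write and returns, and bdwrite() only marks the buffer
 * delayed-write so fsflush can pick it up later.  The selector argument is
 * hypothetical.
 */
#ifdef	BIO_EXAMPLES
static void
bio_example_release_dirty(struct buf *bp, int how)
{
	switch (how) {
	case 0:
		bwrite(bp);	/* synchronous write, then release */
		break;
	case 1:
		bawrite(bp);	/* asynchronous write, released on completion */
		break;
	default:
		bdwrite(bp);	/* mark B_DELWRI and release immediately */
		break;
	}
}
#endif	/* BIO_EXAMPLES */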
/*
 * Release the buffer, with no I/O implied.
 */
void
brelse(struct buf *bp)
{
	struct buf	**backp;
	uint_t		index;
	kmutex_t	*hmp;
	struct buf	*dp;
	struct hbuf	*hp;

	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * Clear the retry write flag if the buffer was written without
	 * error.  The presence of B_DELWRI means the buffer has not yet
	 * been written and the presence of B_ERROR means that an error
	 * is still occurring.
	 */
	if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
		bp->b_flags &= ~B_RETRYWRI;
	}

	/* Check for anomalous conditions */
	if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
		if (bp->b_flags & B_NOCACHE) {
			/* Don't add to the freelist. Destroy it now */
			kmem_free(bp->b_un.b_addr, bp->b_bufsize);
			sema_destroy(&bp->b_sem);
			sema_destroy(&bp->b_io);
			kmem_free(bp, sizeof (struct buf));
			return;
		}
		/*
		 * If a write failed and we are supposed to retry write,
		 * don't toss the buffer.  Keep it around and mark it
		 * delayed write in the hopes that it will eventually
		 * get flushed (and still keep the system running.)
		 */
		if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
			bp->b_flags |= B_DELWRI;
			/* keep fsflush from trying continuously to flush */
			bp->b_start = ddi_get_lbolt();
		} else
			bp->b_flags |= B_AGE|B_STALE;
		bp->b_flags &= ~B_ERROR;
	}

	/*
	 * If delayed write is set then put it on the delayed
	 * write list instead of the free buffer list.
	 */
	index = bio_bhash(bp->b_edev, bp->b_blkno);
	hmp = &hbuf[index].b_lock;

	mutex_enter(hmp);
	hp = &hbuf[index];
	dp = (struct buf *)hp;

	/*
	 * Make sure that the number of entries on this list is
	 * Zero <= count <= total # buffers
	 */
	ASSERT(hp->b_length >= 0);
	ASSERT(hp->b_length < nbuf);

	hp->b_length++;		/* We are adding this buffer */

	if (bp->b_flags & B_DELWRI) {
		/*
		 * This buffer goes on the delayed write buffer list
		 */
		dp = (struct buf *)&dwbuf[index];
	}
	ASSERT(bp->b_bufsize > 0);
	ASSERT(bp->b_bcount > 0);
	ASSERT(bp->b_un.b_addr != NULL);

	if (bp->b_flags & B_AGE) {
		backp = &dp->av_forw;
		(*backp)->av_back = bp;
		bp->av_forw = *backp;
		*backp = bp;
		bp->av_back = dp;
	} else {
		backp = &dp->av_back;
		(*backp)->av_forw = bp;
		bp->av_back = *backp;
		*backp = bp;
		bp->av_forw = dp;
	}
	mutex_exit(hmp);

	if (bfreelist.b_flags & B_WANTED) {
		/*
		 * Should come here very very rarely.
		 */
		mutex_enter(&bfree_lock);
		if (bfreelist.b_flags & B_WANTED) {
			bfreelist.b_flags &= ~B_WANTED;
			cv_broadcast(&bio_mem_cv);
		}
		mutex_exit(&bfree_lock);
	}

	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);

	/*
	 * Don't let anyone get the buffer off the freelist before we
	 * release our hold on it.
	 */
	sema_v(&bp->b_sem);
}
/*
 * Return a count of the number of B_BUSY buffers in the system.
 * Can only be used as a good estimate.  If 'cleanit' is set,
 * try to flush all bufs.
 */
int
bio_busy(int cleanit)
{
	struct buf *bp, *dp;
	int busy = 0;
	int i;
	kmutex_t *hmp;

	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_flags & B_BUSY)
				busy++;
		}
		mutex_exit(hmp);
	}

	if (cleanit && busy != 0) {
		bflush(NODEV);
	}

	return (busy);
}
/*
 * this interface is provided for binary compatibility.
 *
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk(dev_t dev, daddr_t blkno, long bsize)
{
	return (getblk_common(/* ufsvfsp */ NULL, dev,
	    blkno, bsize, /* errflg */ 0));
}
/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk_common(void *arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
{
	ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
	struct buf *bp;
	struct buf *dp;
	struct buf *nbp = NULL;
	struct buf *errbp;
	uint_t index;
	kmutex_t *hmp;
	struct hbuf *hp;

	if (getmajor(dev) >= devcnt)
		cmn_err(CE_PANIC, "blkdev");

	biostats.bio_lookup.value.ui32++;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	dp = (struct buf *)hp;
	hmp = &hp->b_lock;

	mutex_enter(hmp);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		/*
		 * Avoid holding the hash lock in the event that
		 * the buffer is locked by someone.  Since the hash chain
		 * may change when we drop the hash lock
		 * we have to start at the beginning of the chain if the
		 * buffer identity/contents aren't valid.
		 */
		if (!sema_tryp(&bp->b_sem)) {
			biostats.bio_bufbusy.value.ui32++;
			/*
			 * OK, we are dealing with a busy buffer.
			 * In the case that we are panicking and we
			 * got called from bread(), we have some chance
			 * for error recovery.  So better bail out from
			 * here since sema_p() won't block.  If we got
			 * called directly from ufs routines, there is
			 * no way to report an error yet.
			 */
			if (panicstr && errflg)
				goto errout;
			/*
			 * For the following line of code to work
			 * correctly never kmem_free the buffer "header".
			 */
			mutex_exit(hmp);
			sema_p(&bp->b_sem);
			if (bp->b_blkno != blkno || bp->b_edev != dev ||
			    (bp->b_flags & B_STALE)) {
				sema_v(&bp->b_sem);
				mutex_enter(hmp);
				goto loop;	/* start over */
			}
			mutex_enter(hmp);
		}

		/* Found the buffer */
		biostats.bio_hit.value.ui32++;
		bp->b_flags &= ~B_AGE;

		/*
		 * Yank it off the free/delayed write lists
		 */
		mutex_exit(hmp);

		ASSERT((bp->b_flags & B_NOCACHE) == NULL);

		if (nbp == NULL) {
			/*
			 * Make the common path short.
			 */
			ASSERT(SEMA_HELD(&bp->b_sem));
			return (bp);
		}

		biostats.bio_bufdup.value.ui32++;

		/*
		 * The buffer must have entered during the lock upgrade
		 * so free the new buffer we allocated and return the
		 * found buffer.
		 */
		kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
		nbp->b_un.b_addr = NULL;

		/*
		 * Account for the memory
		 */
		mutex_enter(&bfree_lock);
		bfreelist.b_bufsize += nbp->b_bufsize;
		mutex_exit(&bfree_lock);

		/*
		 * Destroy buf identity, and place on avail list
		 */
		nbp->b_dev = (o_dev_t)NODEV;

		ASSERT(SEMA_HELD(&bp->b_sem));
		return (bp);
	}

	/*
	 * bio_getfreeblk may block so check the hash chain again.
	 */
	if (nbp == NULL) {
		mutex_exit(hmp);
		nbp = bio_getfreeblk(bsize);
		mutex_enter(hmp);
		goto loop;
	}

	/*
	 * New buffer.  Assign nbp and stick it on the hash.
	 */
	nbp->b_flags = B_BUSY;
	nbp->b_edev = dev;
	nbp->b_dev = (o_dev_t)cmpdev(dev);
	nbp->b_blkno = blkno;
	nbp->b_iodone = NULL;
	nbp->b_bcount = bsize;
	/*
	 * If we are given a ufsvfsp and the vfs_root field is NULL
	 * then this must be I/O for a superblock.  A superblock's
	 * buffer is set up in mountfs() and there is no root vnode
	 * at that point.
	 */
	if (ufsvfsp && ufsvfsp->vfs_root) {
		nbp->b_vp = ufsvfsp->vfs_root;
	} else {
		nbp->b_vp = NULL;
	}

	ASSERT((nbp->b_flags & B_NOCACHE) == NULL);

	mutex_exit(hmp);

	ASSERT(SEMA_HELD(&nbp->b_sem));

	return (nbp);

	/*
	 * Come here in case of an internal error.  At this point we couldn't
	 * get a buffer, but we have to return one.  Hence we allocate some
	 * kind of error reply buffer on the fly.  This buffer is marked as
	 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
	 *	- B_ERROR will indicate error to the caller.
	 *	- B_DONE will prevent us from reading the buffer from
	 *	  the device.
	 *	- B_NOCACHE will cause that this buffer gets free'd in
	 *	  brelse().
	 */
errout:
	errbp = geteblk();
	sema_p(&errbp->b_sem);
	errbp->b_flags &= ~B_BUSY;
	errbp->b_flags |= (B_ERROR | B_DONE);
	return (errbp);
}
/*
 * Get an empty block, not assigned to any particular device.
 * Returns a locked buffer that is not on any hash or free list.
 */
struct buf *
ngeteblk(long bsize)
{
	struct buf *bp;

	bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
	bioinit(bp);
	bp->av_forw = bp->av_back = NULL;
	bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
	bp->b_bufsize = bsize;
	bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
	bp->b_dev = (o_dev_t)NODEV;
	bp->b_bcount = bsize;
	return (bp);
}

/*
 * Interface of geteblk() is kept intact to maintain driver compatibility.
 * Use ngeteblk() to allocate block size other than 1 KB.
 */
struct buf *
geteblk(void)
{
	return (ngeteblk((long)1024));
}
/*
 * Return a buffer w/o sleeping
 */
struct buf *
trygetblk(dev_t dev, daddr_t blkno)
{
	struct buf	*bp;
	struct buf	*dp;
	struct hbuf	*hp;
	kmutex_t	*hmp;
	uint_t		index;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	hmp = &hp->b_lock;

	if (!mutex_tryenter(hmp))
		return (NULL);

	dp = (struct buf *)hp;
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		/*
		 * Get access to a valid buffer without sleeping
		 */
		if (sema_tryp(&bp->b_sem)) {
			if (bp->b_flags & B_DONE) {
				mutex_exit(hmp);
				return (bp);
			}
			sema_v(&bp->b_sem);
		}
		break;
	}
	mutex_exit(hmp);
	return (NULL);
}
/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
int
iowait(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	return (biowait(bp));
}
/*
 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 * and wake up anyone waiting for it.
 */
void
iodone(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	biodone(bp);
}
/*
 * Zero the core associated with a buffer.
 */
void
clrbuf(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	bzero(bp->b_un.b_addr, bp->b_bcount);
	bp->b_resid = 0;
}
/*
 * Make sure all write-behind blocks on dev (or NODEV for all)
 * are flushed out.
 */
void
bflush(dev_t dev)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *delwri_list = EMPTY_LIST;
	kmutex_t *hmp;
	uint_t index;
	int i;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any invalidates or flushes ahead of us to finish.
	 * We really could split blist_lock up per device for better
	 * parallelism here.
	 */
	while (bio_doinginval || bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doingflush++;
	/*
	 * Gather all B_DELWRI buffers for device.
	 * Lock ordering is b_sem > hash lock (brelse).
	 * Since we are finding the buffer via the delayed write list,
	 * it may be busy and we would block trying to get the
	 * b_sem lock while holding hash lock.  So transfer all the
	 * candidates on the delwri_list and then drop the hash locks.
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		hmp = &hbuf[i].b_lock;
		dp = (struct buf *)&dwbuf[i];
		mutex_enter(hmp);
		for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
			if (dev == NODEV || bp->b_edev == dev) {
				if (bp->b_list == NULL) {
					bp->b_list = delwri_list;
					delwri_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/*
	 * Now that the hash locks have been dropped grab the semaphores
	 * and write back all the buffers that have B_DELWRI set.
	 */
	while (delwri_list != EMPTY_LIST) {
		bp = delwri_list;

		sema_p(&bp->b_sem);	/* may block */
		if ((dev != bp->b_edev && dev != NODEV) ||
		    (panicstr && bp->b_flags & B_BUSY)) {
			sema_v(&bp->b_sem);
			delwri_list = bp->b_list;
			bp->b_list = NULL;
			continue;	/* No longer a candidate */
		}
		if (bp->b_flags & B_DELWRI) {
			index = bio_bhash(bp->b_edev, bp->b_blkno);
			hp = &hbuf[index];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			bp->b_flags |= B_ASYNC;
			if (bp->b_vp == NULL) {		/* !ufs */
				BWRITE(bp);
			} else {			/* ufs */
				UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
			}
		} else {
			sema_v(&bp->b_sem);
		}
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
	mutex_enter(&blist_lock);
	bio_doingflush--;
	if (bio_flinv_cv_wanted) {
		bio_flinv_cv_wanted = 0;
		cv_broadcast(&bio_flushinval_cv);
	}
	mutex_exit(&blist_lock);
}
/*
 * Ensure that a specified block is up-to-date on disk.
 */
void
blkflush(dev_t dev, daddr_t blkno)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *sbp = NULL;
	uint_t index;
	kmutex_t *hmp;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	dp = (struct buf *)hp;
	hmp = &hp->b_lock;

	/*
	 * Identify the buffer in the cache belonging to
	 * this device and blkno (if any).
	 */
	mutex_enter(hmp);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		sbp = bp;
		break;
	}
	mutex_exit(hmp);
	if (sbp == NULL)
		return;
	/*
	 * Now check the buffer we have identified and
	 * make sure it still belongs to the device and is B_DELWRI
	 */
	sema_p(&sbp->b_sem);
	if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
	    (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
		/*
		 * XXX - There is nothing to guarantee a synchronous
		 * write here if the B_ASYNC flag is set.  This needs
		 * some investigation.
		 */
		if (sbp->b_vp == NULL) {		/* !ufs */
			BWRITE(sbp);	/* synchronous write */
		} else {			/* ufs */
			UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
		}
	} else {
		sema_v(&sbp->b_sem);
	}
}
/*
 * Same as binval, except can force-invalidate delayed-write buffers
 * (which may not already be flushed because of device errors).  Also
 * makes sure that the retry write flag is cleared.
 */
int
bfinval(dev_t dev, int force)
{
	struct buf *dp;
	struct buf *bp;
	struct buf *binval_list = EMPTY_LIST;
	struct buf **backp;
	kmutex_t *hmp;
	uint_t index;
	int i;
	int error = 0;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any flushes ahead of us to finish, it's ok to
	 * do invalidates in parallel.
	 */
	while (bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doinginval++;

	/* Gather bp's */
	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_edev == dev) {
				if (bp->b_list == NULL) {
					bp->b_list = binval_list;
					binval_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/* Invalidate all bp's found */
	while (binval_list != EMPTY_LIST) {
		bp = binval_list;

		sema_p(&bp->b_sem);
		if (bp->b_edev == dev) {
			if (force && (bp->b_flags & B_DELWRI)) {
				/* clear B_DELWRI, move to non-dw freelist */
				index = bio_bhash(bp->b_edev, bp->b_blkno);
				hmp = &hbuf[index].b_lock;
				dp = (struct buf *)&hbuf[index];
				mutex_enter(hmp);

				/* remove from delayed write freelist */

				/* add to B_AGE side of non-dw freelist */
				backp = &dp->av_forw;
				(*backp)->av_back = bp;
				bp->av_forw = *backp;
				*backp = bp;
				bp->av_back = dp;

				/*
				 * make sure write retries and busy are cleared
				 */
				bp->b_flags &=
				    ~(B_BUSY | B_DELWRI | B_RETRYWRI);
				mutex_exit(hmp);
			}
			if ((bp->b_flags & B_DELWRI) == 0)
				bp->b_flags |= B_STALE|B_AGE;
			else
				error = EIO;
		}
		sema_v(&bp->b_sem);
		binval_list = bp->b_list;
		bp->b_list = NULL;
	}
	mutex_enter(&blist_lock);
	bio_doinginval--;
	if (bio_flinv_cv_wanted) {
		cv_broadcast(&bio_flushinval_cv);
		bio_flinv_cv_wanted = 0;
	}
	mutex_exit(&blist_lock);
	return (error);
}
/*
 * If possible, invalidate blocks for a dev on demand
 */
void
binval(dev_t dev)
{
	(void) bfinval(dev, 0);
}
/*
 * Initialize the buffer I/O system by freeing
 * all buffers and setting all device hash buffer lists to empty.
 */
void
binit(void)
{
	struct buf *bp;
	unsigned int i, pct;
	ulong_t	bio_max_hwm, bio_default_hwm;

	/*
	 * Maximum/Default values for bufhwm are set to the smallest of:
	 *	- BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
	 *	- 1/4 of kernel virtual memory
	 *	- INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
	 * Additionally, in order to allow simple tuning by percentage of
	 * physical memory, bufhwm_pct is used to calculate the default if
	 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
	 *
	 * Since the unit for v.v_bufhwm is kilobytes, this allows for
	 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
	 */
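	/*
	 * Worked example (hypothetical numbers, not from the original
	 * source): with 8 GB of physical memory and 4 KB pages, physmem is
	 * 2097152 pages, so the default high-water mark is roughly
	 * (2097152 / (100/2)) * (4096 / 1024) = 167772 KB, i.e. about 2%
	 * of RAM, assuming the kernel heap term is not the smaller one.
	 */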
	bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
	bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);

	pct = BIO_BUF_PERCENT;
	if (bufhwm_pct != 0 &&
	    ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
		pct = BIO_BUF_PERCENT;
		/*
		 * Invalid user specified value, emit a warning.
		 */
		cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
		    range(1..%d). Using %d as default.",
		    bufhwm_pct,
		    100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
	}

	bio_default_hwm = MIN(physmem / pct,
	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
	bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);

	if ((v.v_bufhwm = bufhwm) == 0)
		v.v_bufhwm = bio_default_hwm;

	if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
		v.v_bufhwm = (int)bio_max_hwm;
		/*
		 * Invalid user specified value, emit a warning.
		 */
		cmn_err(CE_WARN,
		    "binit: bufhwm(%d) out \
		    of range(%d..%lu). Using %lu as default",
		    bufhwm,
		    BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
	}

	/*
	 * Determine the number of hash buckets.  Default is to
	 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
	 * Round up number to the next power of 2.
	 */
	v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
	    BIO_HASHLEN);
	v.v_hmask = v.v_hbuf - 1;
	v.v_buf = BIO_BHDR_POOL;

	hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);

	dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);

	bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;

	bp = &bfreelist;
	bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;

	for (i = 0; i < v.v_hbuf; i++) {
		hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
		hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];

		/*
		 * Initialize the delayed write buffer list.
		 */
		dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
		dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
	}
}
/*
 * Wait for I/O completion on the buffer; return error code.
 * If bp was for synchronous I/O, bp is invalid and associated
 * resources are freed on return.
 */
int
biowait(struct buf *bp)
{
	int error = 0;
	struct cpu *cpup;

	ASSERT(SEMA_HELD(&bp->b_sem));

	cpup = CPU;
	atomic_inc_64(&cpup->cpu_stats.sys.iowait);
	DTRACE_IO1(wait__start, struct buf *, bp);

	/*
	 * In case of panic, busy wait for completion
	 */
	if (panicstr) {
		while ((bp->b_flags & B_DONE) == 0)
			drv_usecwait(10);
	} else {
		sema_p(&bp->b_io);
	}

	DTRACE_IO1(wait__done, struct buf *, bp);
	atomic_dec_64(&cpup->cpu_stats.sys.iowait);

	error = geterror(bp);
	if ((bp->b_flags & B_ASYNC) == 0) {
		if (bp->b_flags & B_REMAPPED)
			bp_mapout(bp);
	}
	return (error);
}
static void
biodone_tnf_probe(struct buf *bp)
{
	/* Kernel probe */
	TNF_PROBE_3(biodone, "io blockio", /* CSTYLED */,
	    tnf_device,		device,		bp->b_edev,
	    tnf_diskaddr,	block,		bp->b_lblkno,
	    tnf_opaque,		buf,		bp);
}
/*
 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 * and wake up anyone waiting for it.
 */
void
biodone(struct buf *bp)
{
	if (bp->b_flags & B_STARTED) {
		DTRACE_IO1(done, struct buf *, bp);
		bp->b_flags &= ~B_STARTED;
	}

	/*
	 * Call the TNF probe here instead of the inline code
	 * to force our compiler to use the tail call optimization.
	 */
	biodone_tnf_probe(bp);

	if (bp->b_iodone != NULL) {
		(*(bp->b_iodone))(bp);
		return;
	}
	ASSERT((bp->b_flags & B_DONE) == 0);
	ASSERT(SEMA_HELD(&bp->b_sem));
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_ASYNC) {
		if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
			bio_pageio_done(bp);
		else
			brelse(bp);	/* release bp to freelist */
	} else {
		sema_v(&bp->b_io);
	}
}
/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized code.
 */
int
geterror(struct buf *bp)
{
	int error = 0;

	ASSERT(SEMA_HELD(&bp->b_sem));
	if (bp->b_flags & B_ERROR) {
		error = bp->b_error;
		if (!error)
			error = EIO;
	}
	return (error);
}
/*
 * Support for pageio buffers.
 *
 * This stuff should be generalized to provide a generalized bp
 * header facility that can be used for things other than pageio.
 */
/*
 * Allocate and initialize a buf struct for use with pageio.
 */
struct buf *
pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
{
	struct buf *bp;
	struct cpu *cpup;

	if (flags & B_READ) {
		CPU_STATS_ENTER_K();
		cpup = CPU;	/* get pointer AFTER preemption is disabled */
		CPU_STATS_ADDQ(cpup, vm, pgin, 1);
		CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
		if ((flags & B_ASYNC) == 0) {
			klwp_t *lwp = ttolwp(curthread);
			if (lwp != NULL)
				lwp->lwp_ru.majflt++;
			CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
			/* Kernel probe */
			TNF_PROBE_2(major_fault, "vm pagefault", /* CSTYLED */,
			    tnf_opaque,		vnode,		pp->p_vnode,
			    tnf_offset,		offset,		pp->p_offset);
		}
		/*
		 * Update statistics for pages being paged in
		 */
		if (pp != NULL && pp->p_vnode != NULL) {
			if (IS_SWAPFSVP(pp->p_vnode)) {
				CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
			} else {
				if (pp->p_vnode->v_flag & VVMEXEC) {
					CPU_STATS_ADDQ(cpup, vm, execpgin,
					    btopr(len));
				} else {
					CPU_STATS_ADDQ(cpup, vm, fspgin,
					    btopr(len));
				}
			}
		}
		CPU_STATS_EXIT_K();
		TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
		    "page_ws_in:pp %p", pp);
		/* Kernel probe */
		TNF_PROBE_3(pagein, "vm pageio io", /* CSTYLED */,
		    tnf_opaque,	vnode,	pp->p_vnode,
		    tnf_offset,	offset,	pp->p_offset,
		    tnf_size,	size,	len);
	}

	bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
	bp->b_bcount = len;
	bp->b_bufsize = len;
	bp->b_pages = pp;
	bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);

	/* Initialize bp->b_sem in "locked" state */
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);

	THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */

	/*
	 * Caller sets dev & blkno and can adjust
	 * b_addr for page offset and can use bp_mapin
	 * to make pages kernel addressable.
	 */
	return (bp);
}
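/*
 * Illustrative sketch (not part of the original source): a pageio consumer
 * typically fills in the device and block number on the buf returned by
 * pageio_setup(), starts the I/O, waits for it, and tears the buf down with
 * pageio_done().  The helper name and the synchronous-read flow shown here
 * are hypothetical.
 */
#ifdef	BIO_EXAMPLES
static int
bio_example_pageio_read(struct page *pp, size_t len, struct vnode *vp,
    dev_t dev, daddr_t blkno, int flags)
{
	struct buf *bp;
	int error;

	bp = pageio_setup(pp, len, vp, flags | B_READ);
	bp->b_edev = dev;		/* caller supplies device and block */
	bp->b_blkno = blkno;
	bp_mapin(bp);			/* make the pages kernel addressable */
	(void) bdev_strategy(bp);
	error = biowait(bp);		/* synchronous: wait for completion */
	pageio_done(bp);		/* destroy the pageio buf */
	return (error);
}
#endif	/* BIO_EXAMPLES */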
void
pageio_done(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	if (bp->b_flags & B_REMAPPED)
		bp_mapout(bp);

	ASSERT((bp->b_flags & B_NOCACHE) != 0);

	/* A sema_v(bp->b_sem) is implied if we are destroying it */
	sema_destroy(&bp->b_sem);
	sema_destroy(&bp->b_io);
	kmem_free(bp, sizeof (struct buf));
}
/*
 * Check to see whether the buffers, except the one pointed by sbp,
 * associated with the device are busy.
 * NOTE: This expensive operation shall be improved together with ufs_icheck().
 */
int
bcheck(dev_t dev, struct buf *sbp)
{
	struct buf	*bp;
	struct buf	*dp;
	int i;
	kmutex_t *hmp;

	/*
	 * check for busy bufs for this filesystem
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			/*
			 * if buf is busy or dirty, then filesystem is busy
			 */
			if ((bp->b_edev == dev) &&
			    ((bp->b_flags & B_STALE) == 0) &&
			    (bp->b_flags & (B_DELWRI|B_BUSY)) &&
			    (bp != sbp)) {
				mutex_exit(hmp);
				return (1);
			}
		}
		mutex_exit(hmp);
	}
	return (0);
}
/*
 * Hash two 32 bit entities.
 */
int
hash2ints(int x, int y)
{
	int hash = 0;

	hash = ((hash * 7) + x) - 1;
	hash = ((hash * 7) + (x >> 8)) - 1;
	hash = ((hash * 7) + (x >> 16)) - 1;
	hash = ((hash * 7) + (x >> 24)) - 1;
	hash = ((hash * 7) + y) - 1;
	hash = ((hash * 7) + (y >> 8)) - 1;
	hash = ((hash * 7) + (y >> 16)) - 1;
	hash = ((hash * 7) + (y >> 24)) - 1;

	return (hash);
}
/*
 * Return a new buffer struct.
 *	Create a new buffer if we haven't gone over our high water
 *	mark for memory, otherwise try to get one off the freelist.
 *
 * Returns a locked buf that has no id and is not on any hash or free
 * list.
 */
static struct buf *
bio_getfreeblk(long bsize)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	kmutex_t *hmp;
	uint_t start, end;
	struct buf *save;

	/*
	 * mutex_enter(&bfree_lock);
	 * bfreelist.b_bufsize represents the amount of memory
	 * mutex_exit(&bfree_lock);  protect ref to bfreelist
	 * we are allowed to allocate in the cache before we hit our hwm.
	 */
	bio_mem_get(bsize);	/* Account for our memory request */

	bp = bio_bhdr_alloc();	/* Get a buf hdr */
	sema_p(&bp->b_sem);	/* Should never fail */

	ASSERT(bp->b_un.b_addr == NULL);
	bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
	if (bp->b_un.b_addr != NULL) {
		/*
		 * Make the common path short
		 */
		bp->b_bufsize = bsize;
		ASSERT(SEMA_HELD(&bp->b_sem));
		return (bp);
	}

	save = bp;		/* Save bp we allocated */
	start = end = lastindex;

	biostats.bio_bufwant.value.ui32++;

	/*
	 * Memory isn't available from the system now.  Scan
	 * the hash buckets till enough space is found.
	 */
	do {
		hp = &hbuf[start];
		hmp = &hp->b_lock;
		dp = (struct buf *)hp;

		mutex_enter(hmp);
		for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {

			if (!sema_tryp(&bp->b_sem)) {
				continue;
			}

			/*
			 * Since we are going down the freelist
			 * associated with this hash bucket the
			 * B_DELWRI flag should not be set.
			 */
			ASSERT(!(bp->b_flags & B_DELWRI));

			if (bp->b_bufsize == bsize) {
				mutex_exit(hmp);

				/*
				 * Didn't kmem_alloc any more, so don't
				 * count it twice.
				 */
				mutex_enter(&bfree_lock);
				bfreelist.b_bufsize += bsize;
				mutex_exit(&bfree_lock);

				/*
				 * Update the lastindex value.
				 */
				lastindex = start;

				/*
				 * Put our saved bp back on the list
				 */
				sema_v(&save->b_sem);
				bio_bhdr_free(save);
				ASSERT(SEMA_HELD(&bp->b_sem));
				return (bp);
			}
			sema_v(&bp->b_sem);
		}
		mutex_exit(hmp);
		start = ((start + 1) % v.v_hbuf);
	} while (start != end);

	biostats.bio_bufwait.value.ui32++;
	bp = save;		/* Use original bp */
	bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);

	bp->b_bufsize = bsize;
	ASSERT(SEMA_HELD(&bp->b_sem));
	return (bp);
}
/*
 * Allocate a buffer header.  If none currently available, allocate
 * a new pool.
 */
static struct buf *
bio_bhdr_alloc(void)
{
	struct buf *dp, *sdp;
	struct buf *bp;
	int i;

	for (;;) {
		mutex_enter(&bhdr_lock);
		if (bhdrlist != NULL) {
			bp = bhdrlist;
			bhdrlist = bp->av_forw;
			mutex_exit(&bhdr_lock);

			bp->av_forw = NULL;
			return (bp);
		}
		mutex_exit(&bhdr_lock);

		/*
		 * Need to allocate a new pool.  If the system is currently
		 * out of memory, then try freeing things on the freelist.
		 */
		dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
		if (dp == NULL) {
			/*
			 * System can't give us a pool of headers, try
			 * recycling from the free lists.
			 */
			bio_recycle(BIO_HEADER, 0);
		} else {
			sdp = dp;
			for (i = 0; i < v.v_buf; i++, dp++) {
				/*
				 * The next two lines are needed since NODEV
				 * is -1 and not NULL
				 */
				dp->b_dev = (o_dev_t)NODEV;
				dp->b_edev = NODEV;
				dp->av_forw = dp + 1;
				sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
				    NULL);
				sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
				    NULL);
			}
			mutex_enter(&bhdr_lock);
			(--dp)->av_forw = bhdrlist;	/* Fix last pointer */
			bhdrlist = sdp;
			nbuf += v.v_buf;
			bp = bhdrlist;
			bhdrlist = bp->av_forw;
			mutex_exit(&bhdr_lock);

			bp->av_forw = NULL;
			return (bp);
		}
	}
}
static void
bio_bhdr_free(struct buf *bp)
{
	ASSERT(bp->b_back == NULL);
	ASSERT(bp->b_forw == NULL);
	ASSERT(bp->av_back == NULL);
	ASSERT(bp->av_forw == NULL);
	ASSERT(bp->b_un.b_addr == NULL);
	ASSERT(bp->b_dev == (o_dev_t)NODEV);
	ASSERT(bp->b_edev == NODEV);
	ASSERT(bp->b_flags == 0);

	mutex_enter(&bhdr_lock);
	bp->av_forw = bhdrlist;
	bhdrlist = bp;
	mutex_exit(&bhdr_lock);
}
/*
 * If we haven't gone over the high water mark, it's o.k. to
 * allocate more buffer space, otherwise recycle buffers
 * from the freelist until enough memory is free for a bsize request.
 *
 * We account for this memory, even though
 * we don't allocate it here.
 */
static void
bio_mem_get(long bsize)
{
	mutex_enter(&bfree_lock);
	if (bfreelist.b_bufsize > bsize) {
		bfreelist.b_bufsize -= bsize;
		mutex_exit(&bfree_lock);
		return;
	}
	mutex_exit(&bfree_lock);
	bio_recycle(BIO_MEM, bsize);
}
/*
 * flush a list of delayed write buffers.
 * (currently used only by bio_recycle below.)
 */
static void
bio_flushlist(struct buf *delwri_list)
{
	struct buf *bp;

	while (delwri_list != EMPTY_LIST) {
		bp = delwri_list;
		bp->b_flags |= B_AGE | B_ASYNC;
		if (bp->b_vp == NULL) {			/* !ufs */
			BWRITE(bp);
		} else {				/* ufs */
			UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
		}
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
}
/*
 * Start recycling buffers on the freelist for one of 2 reasons:
 *	- we need a buffer header
 *	- we need to free up memory
 * Once started we continue to recycle buffers until the B_AGE
 * buffers are gone.
 */
static void
bio_recycle(int want, long bsize)
{
	struct buf *bp, *dp, *dwp, *nbp;
	struct hbuf *hp;
	kmutex_t *hmp;
	struct buf *delwri_list = EMPTY_LIST;
	uint_t start, end;
	int found = 0;

top:
	start = end = lastindex;
	do {
		hp = &hbuf[start];
		hmp = &hp->b_lock;
		dp = (struct buf *)hp;

		mutex_enter(hmp);
		for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {

			if (!sema_tryp(&bp->b_sem))
				continue;
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */
			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				return;	/* All done */
			}

			ASSERT(MUTEX_HELD(&hp->b_lock));
			ASSERT(!(bp->b_flags & B_DELWRI));

			/*
			 * Remove bhdr from cache, free up memory,
			 * and add the hdr to the freelist.
			 */
			if (bp->b_bufsize) {
				kmem_free(bp->b_un.b_addr, bp->b_bufsize);
				bp->b_un.b_addr = NULL;
				mutex_enter(&bfree_lock);
				bfreelist.b_bufsize += bp->b_bufsize;
				mutex_exit(&bfree_lock);
			}
			bp->b_dev = (o_dev_t)NODEV;

			if (want == BIO_HEADER) {
				found = 1;
			} else {
				ASSERT(want == BIO_MEM);
				if (!found && bfreelist.b_bufsize >= bsize) {
					/* Account for the memory we want */
					mutex_enter(&bfree_lock);
					if (bfreelist.b_bufsize >= bsize) {
						bfreelist.b_bufsize -= bsize;
						found = 1;
					}
					mutex_exit(&bfree_lock);
				}
			}
			mutex_exit(hmp);

			/*
			 * Since we dropped hmp start from the
			 * beginning.
			 */
			goto top;
		}
		mutex_exit(hmp);

		/*
		 * Look at the delayed write list.
		 * First gather into a private list, then write them.
		 */
		dwp = (struct buf *)&dwbuf[start];
		mutex_enter(&blist_lock);
		for (bp = dwp->av_forw; bp != dwp; bp = nbp) {
			nbp = bp->av_forw;

			if (!sema_tryp(&bp->b_sem))
				continue;
			ASSERT(bp->b_flags & B_DELWRI);
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */
			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(&blist_lock);
				bio_flushlist(delwri_list);
				mutex_enter(&blist_lock);
				if (bio_flinv_cv_wanted) {
					bio_flinv_cv_wanted = 0;
					cv_broadcast(&bio_flushinval_cv);
				}
				mutex_exit(&blist_lock);
				return;	/* All done */
			}

			/*
			 * If the buffer is already on a flush or
			 * invalidate list then just skip it.
			 */
			if (bp->b_list != NULL) {
				sema_v(&bp->b_sem);
				continue;
			}
			/*
			 * We are still on the same bucket.
			 */
			bp->b_list = delwri_list;
			delwri_list = bp;
		}
		mutex_exit(&blist_lock);
		bio_flushlist(delwri_list);
		delwri_list = EMPTY_LIST;
		mutex_enter(&blist_lock);
		if (bio_flinv_cv_wanted) {
			bio_flinv_cv_wanted = 0;
			cv_broadcast(&bio_flushinval_cv);
		}
		mutex_exit(&blist_lock);
		start = (start + 1) % v.v_hbuf;
	} while (start != end);

	if (found)
		return;

	/*
	 * Free lists exhausted and we haven't satisfied the request.
	 * Wait here for more entries to be added to freelist.
	 * Because this might have just happened, make it timed.
	 */
	mutex_enter(&bfree_lock);
	bfreelist.b_flags |= B_WANTED;
	(void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
	mutex_exit(&bfree_lock);
	goto top;
}
/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
 */
static int
bio_incore(dev_t dev, daddr_t blkno)
{
	struct buf *bp;
	struct buf *dp;
	uint_t index;
	kmutex_t *hmp;

	index = bio_bhash(dev, blkno);
	dp = (struct buf *)&hbuf[index];
	hmp = &hbuf[index].b_lock;

	mutex_enter(hmp);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno == blkno && bp->b_edev == dev &&
		    (bp->b_flags & B_STALE) == 0) {
			mutex_exit(hmp);
			return (1);
		}
	}
	mutex_exit(hmp);
	return (0);
}
static void
bio_pageio_done(struct buf *bp)
{
	if (bp->b_flags & B_PAGEIO) {

		if (bp->b_flags & B_REMAPPED)
			bp_mapout(bp);

		if (bp->b_flags & B_READ)
			pvn_read_done(bp->b_pages, bp->b_flags);
		else
			pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
		pageio_done(bp);
	} else {
		ASSERT(bp->b_flags & B_REMAPPED);
		bp_mapout(bp);
		brelse(bp);
	}
}
/*
 * bioerror(9F) - indicate error in buffer header
 * If 'error' is zero, remove the error indication.
 */
void
bioerror(struct buf *bp, int error)
{
	ASSERT(SEMA_HELD(&bp->b_sem));

	if (error != 0) {
		bp->b_flags |= B_ERROR;
	} else {
		bp->b_flags &= ~B_ERROR;
	}
	bp->b_error = error;
}
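/*
 * Illustrative sketch (not part of the original source): a driver strategy
 * routine that cannot start a transfer records the failure with bioerror()
 * and completes the buf with biodone().  ENXIO is just an example errno;
 * the helper name is hypothetical.
 */
#ifdef	BIO_EXAMPLES
static int
bio_example_strategy_fail(struct buf *bp)
{
	bioerror(bp, ENXIO);	/* set B_ERROR and b_error */
	biodone(bp);		/* wake up or release the waiter */
	return (0);
}
#endif	/* BIO_EXAMPLES */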
/*
 * bioreset(9F) - reuse a private buffer header after I/O is complete
 */
void
bioreset(struct buf *bp)
{
	ASSERT(bp != NULL);

	biofini(bp);
	bioinit(bp);
}

/*
 * biosize(9F) - return size of a buffer header
 */
size_t
biosize(void)
{
	return (sizeof (struct buf));
}
1969 biomodified(struct buf
*bp
)
1977 if ((bp
->b_flags
& B_PAGEIO
) == 0) {
1981 npf
= btopr(bp
->b_bcount
+ ((uintptr_t)bp
->b_un
.b_addr
& PAGEOFFSET
));
1984 ppattr
= hat_pagesync(pp
, HAT_SYNC_DONTZERO
|
1985 HAT_SYNC_STOPON_MOD
);
/*
 * bioinit(9F) - initialize a buffer structure
 */
void
bioinit(struct buf *bp)
{
	bzero(bp, sizeof (struct buf));
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
}
/*
 * biofini(9F) - uninitialize a buffer structure
 */
void
biofini(struct buf *bp)
{
	sema_destroy(&bp->b_io);
	sema_destroy(&bp->b_sem);
}
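/*
 * Illustrative sketch (not part of the original source): a driver that
 * embeds a buf in a private structure initializes it once with bioinit(),
 * may recycle it with bioreset() between transfers, and tears it down with
 * biofini().  The structure and helper names are hypothetical.
 */
#ifdef	BIO_EXAMPLES
struct bio_example_softc {
	struct buf	ex_buf;		/* embedded private buffer */
};

static void
bio_example_buf_lifecycle(struct bio_example_softc *sc)
{
	bioinit(&sc->ex_buf);		/* zero the buf, set up semaphores */
	/* ... issue I/O; call bioreset(&sc->ex_buf) between transfers ... */
	biofini(&sc->ex_buf);		/* destroy the semaphores */
}
#endif	/* BIO_EXAMPLES */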
/*
 * bioclone(9F) - clone a buffer
 */
struct buf *
bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
    int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
{
	struct buf *bufp;

	if (bp_mem == NULL) {
		bufp = kmem_alloc(sizeof (struct buf), sleep);
		if (bufp == NULL)
			return (NULL);
		bioinit(bufp);
	} else {
		bufp = bp_mem;
		bioreset(bufp);
	}

#define	BUF_CLONE_FLAGS	(B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
	B_ABRWRITE)

	/*
	 * The cloned buffer does not inherit the B_REMAPPED flag.
	 */
	bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS) | B_BUSY;
	bufp->b_bcount = len;
	bufp->b_blkno = blkno;
	bufp->b_iodone = iodone;
	bufp->b_proc = bp->b_proc;
	bufp->b_edev = dev;
	bufp->b_file = bp->b_file;
	bufp->b_offset = bp->b_offset;

	if (bp->b_flags & B_SHADOW) {
		ASSERT(bp->b_shadow);
		ASSERT(bp->b_flags & B_PHYS);

		bufp->b_shadow = bp->b_shadow +
		    btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
		bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
		if (bp->b_flags & B_REMAPPED)
			bufp->b_proc = NULL;
	} else {
		if (bp->b_flags & B_PAGEIO) {
			struct page *pp;
			offset_t o;
			int i;

			pp = bp->b_pages;
			o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
			for (i = btop(o); i > 0; i--) {
				pp = pp->p_next;
			}
			bufp->b_pages = pp;
			bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
		} else {
			bufp->b_un.b_addr =
			    (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
			if (bp->b_flags & B_REMAPPED)
				bufp->b_proc = NULL;