/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2011 Joyent, Inc.  All rights reserved.
 */

/*
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */
/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/
/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/sysmacros.h>
#include <sys/conf.h>
#include <sys/cpuvar.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/buf.h>
#include <sys/var.h>
#include <sys/vnode.h>
#include <sys/bitmap.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/atomic.h>
#include <vm/seg_kmem.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <sys/vtrace.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/systm.h>
#include <sys/vfs.h>
#include <sys/sdt.h>
static kmutex_t	blist_lock;	/* protects b_list */
static kmutex_t	bhdr_lock;	/* protects the bhdrlist */
static kmutex_t	bfree_lock;	/* protects the bfreelist structure */

struct hbuf	*hbuf;			/* Hash buckets */
struct dwbuf	*dwbuf;			/* Delayed write buckets */
static struct buf *bhdrlist;		/* buf header free list */
static int	nbuf;			/* number of buffer headers allocated */

static int	lastindex;		/* Reference point on where to start */
					/* when looking for free buffers */
#define	bio_bhash(dev, bn)	(hash2ints((dev), (int)(bn)) & v.v_hmask)
#define	EMPTY_LIST	((struct buf *)-1)
static kcondvar_t	bio_mem_cv;		/* Condition variables */
static kcondvar_t	bio_flushinval_cv;
static int	bio_doingflush;		/* flush in progress */
static int	bio_doinginval;		/* inval in progress */
static int	bio_flinv_cv_wanted;	/* someone waiting for cv */
/*
 * Statistics on the buffer cache
 */
struct biostats biostats = {
	{ "buffer_cache_lookups",		KSTAT_DATA_UINT32 },
	{ "buffer_cache_hits",			KSTAT_DATA_UINT32 },
	{ "new_buffer_requests",		KSTAT_DATA_UINT32 },
	{ "waits_for_buffer_allocs",		KSTAT_DATA_UINT32 },
	{ "buffers_locked_by_someone",		KSTAT_DATA_UINT32 },
	{ "duplicate_buffers_found",		KSTAT_DATA_UINT32 }
};

kstat_named_t	*biostats_ptr = (kstat_named_t *)&biostats;
uint_t		biostats_ndata = (uint_t)(sizeof (biostats) /
    sizeof (kstat_named_t));
/*
 * Statistics on ufs buffer cache
 * Not protected by locks
 */
struct ufsbiostats ub = {
	{ "breads",			KSTAT_DATA_UINT32 },
	{ "bwrites",			KSTAT_DATA_UINT32 },
	{ "fbiwrites",			KSTAT_DATA_UINT32 },
	{ "getpages",			KSTAT_DATA_UINT32 },
	{ "getras",			KSTAT_DATA_UINT32 },
	{ "putsyncs",			KSTAT_DATA_UINT32 },
	{ "putasyncs",			KSTAT_DATA_UINT32 },
	{ "putpageios",			KSTAT_DATA_UINT32 },
};
/*
 * more UFS Logging eccentricities...
 *
 * required since "#pragma weak ..." doesn't work in reverse order.
 * i.e.:  genunix (bio.c) is loaded before the ufs modules and pointers
 *        to ufs routines don't get plugged into bio.c calls so
 *        we initialize it when setting up the "lufsops" table
 *        in "lufs.c:_init()"
 */
void (*bio_lufs_strategy)(void *, buf_t *);
void (*bio_snapshot_strategy)(void *, buf_t *);
/* Private routines */
static struct buf	*bio_getfreeblk(long);
static void		bio_mem_get(long);
static void		bio_bhdr_free(struct buf *);
static struct buf	*bio_bhdr_alloc(void);
static void		bio_recycle(int, long);
static void		bio_pageio_done(struct buf *);
static int		bio_incore(dev_t, daddr_t);
/*
 * Buffer cache constants
 */
#define	BIO_BUF_PERCENT	(100/2)		/* default: 2% of memory */
#define	BIO_MAX_PERCENT	(100/20)	/* max is 20% of real memory */
#define	BIO_BHDR_POOL	100		/* Default bhdr pool size */
#define	BIO_MIN_HDR	10		/* Minimum number of buffer headers */
#define	BIO_MIN_HWM	(BIO_MIN_HDR * MAXBSIZE / 1024)
#define	BIO_HASHLEN	4		/* Target length of hash chains */
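
/*
 * Worked example (assuming MAXBSIZE == 8192, as on historical SunOS):
 * BIO_MIN_HWM == 10 * 8192 / 1024 == 80KB, i.e. the smallest legal high
 * water mark covers ten maximum-size buffers.  Note that the two percent
 * macros are divisors, not percentages: physmem / BIO_MAX_PERCENT is
 * physmem / 5, or 20% of real memory.
 */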
/* Flags for bio_recycle() */
#define	BIO_HEADER	0x01
#define	BIO_MEM		0x02
extern	int bufhwm;		/* User tunable - high water mark for mem */
extern	int bufhwm_pct;		/* ditto - given in % of physmem */
/*
 * The following routines allocate and free
 * buffers with various side effects.  In general the
 * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer
 * to the buffer header; the buffer returned is locked with a
 * binary semaphore so that no one else can touch it.  If the block was
 * already in core, no I/O need be done; if it is
 * already locked, the process waits until it becomes free.
 * The following routines allocate a buffer:
 *	getblk
 *	bread/BREAD
 *	breada
 * Eventually the buffer must be released, possibly with the
 * side effect of writing it out, by using one of
 *	bwrite/BWRITE/brwrite
 *	bdwrite/BDWRITE
 *	bawrite
 *	brelse
 *
 * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
 * Instead, a binary semaphore, b_sem is used to gain exclusive access to
 * a buffer and a binary semaphore, b_io is used for I/O synchronization.
 * B_DONE is still used to denote a buffer with I/O complete on it.
 *
 * The bfreelist.b_bcount field is computed every time fsflush runs.  It
 * should not be used where a very accurate count of the free buffers is
 * needed.
 */
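
/*
 * Usage sketch (hypothetical caller, not part of the original file):
 * read a block, inspect it, release it.  b_sem is held from bread()
 * until brelse(), so no other thread can touch the buffer in between.
 *
 *	struct buf *bp;
 *
 *	bp = bread(dev, blkno, bsize);		// locked, contents valid
 *	if (geterror(bp) == 0) {
 *		// ...inspect bp->b_un.b_addr...
 *	}
 *	brelse(bp);				// unlock, return to freelist
 */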
/*
 * Read in (if necessary) the block and return a buffer pointer.
 *
 * This interface is provided for binary compatibility.  Using
 * BREAD() directly avoids the extra function call overhead invoked
 * by calling this routine.
 */
struct buf *
bread(dev_t dev, daddr_t blkno, long bsize)
{
	return (BREAD(dev, blkno, bsize));
}
/*
 * Common code for reading a buffer with various options.
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
{
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
	struct buf *bp;
	klwp_t *lwp = ttolwp(curthread);

	CPU_STATS_ADD_K(sys, lread, 1);
	bp = getblk_common(ufsvfsp, dev, blkno, bsize, /* errflg */ 1);
	if (bp->b_flags & B_DONE)
		return (bp);
	bp->b_flags |= B_READ;
	ASSERT(bp->b_bcount == bsize);
	if (ufsvfsp == NULL) {					/* !ufs */
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
		/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
		/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
		ub.ub_breads.value.ul++;		/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (lwp != NULL)
		lwp->lwp_ru.inblock++;
	CPU_STATS_ADD_K(sys, bread, 1);
	(void) biowait(bp);
	return (bp);
}
/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
{
	struct buf *bp, *rabp;
	klwp_t *lwp = ttolwp(curthread);

	bp = NULL;
	if (!bio_incore(dev, blkno)) {
		CPU_STATS_ADD_K(sys, lread, 1);
		bp = GETBLK(dev, blkno, bsize);
		if ((bp->b_flags & B_DONE) == 0) {
			bp->b_flags |= B_READ;
			bp->b_bcount = bsize;
			(void) bdev_strategy(bp);
			if (lwp != NULL)
				lwp->lwp_ru.inblock++;
			CPU_STATS_ADD_K(sys, bread, 1);
		}
	}
	if (rablkno && bfreelist.b_bcount > 1 &&
	    !bio_incore(dev, rablkno)) {
		rabp = GETBLK(dev, rablkno, bsize);
		if (rabp->b_flags & B_DONE) {
			brelse(rabp);
		} else {
			rabp->b_flags |= B_READ|B_ASYNC;
			rabp->b_bcount = bsize;
			(void) bdev_strategy(rabp);
			if (lwp != NULL)
				lwp->lwp_ru.inblock++;
			CPU_STATS_ADD_K(sys, bread, 1);
		}
	}
	if (bp == NULL)
		return (BREAD(dev, blkno, bsize));
	(void) biowait(bp);
	return (bp);
}
/*
 * Common code for writing a buffer with various options.
 *
 * force_wait  - wait for write completion regardless of B_ASYNC flag
 * do_relse    - release the buffer when we are done
 * clear_flags - flags to clear from the buffer
 */
void
bwrite_common(void *arg, struct buf *bp, int force_wait,
    int do_relse, int clear_flags)
{
	register int do_wait;
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
	int flag;
	klwp_t *lwp = ttolwp(curthread);
	struct cpu *cpup;

	ASSERT(SEMA_HELD(&bp->b_sem));
	flag = bp->b_flags;
	bp->b_flags &= ~clear_flags;
	if (lwp != NULL)
		lwp->lwp_ru.oublock++;
	CPU_STATS_ENTER_K();
	cpup = CPU;		/* get pointer AFTER preemption is disabled */
	CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
	CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
	do_wait = ((flag & B_ASYNC) == 0 || force_wait);
	if (do_wait == 0)
		CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
	CPU_STATS_EXIT_K();
	if (ufsvfsp == NULL) {
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
		/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
		/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ub.ub_bwrites.value.ul++;		/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (do_wait) {
		(void) biowait(bp);
		if (do_relse) {
			brelse(bp);
		}
	}
}
/*
 * Write the buffer, waiting for completion (unless B_ASYNC is set).
 * Then release the buffer.
 * This interface is provided for binary compatibility.  Using
 * BWRITE() directly avoids the extra function call overhead invoked
 * by calling this routine.
 */
void
bwrite(struct buf *bp)
{
	BWRITE(bp);
}
/*
 * Write the buffer, waiting for completion.
 * But don't release the buffer afterwards.
 * This interface is provided for binary compatibility.  Using
 * BWRITE2() directly avoids the extra function call overhead.
 */
void
bwrite2(struct buf *bp)
{
	BWRITE2(bp);
}
/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * Also save the time that the block is first marked as delayed
 * so that it will be written in a reasonable time.
 */
void
bdwrite(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	CPU_STATS_ADD_K(sys, lwrite, 1);
	if ((bp->b_flags & B_DELWRI) == 0)
		bp->b_start = ddi_get_lbolt();
	/*
	 * B_DONE allows others to use the buffer, B_DELWRI causes the
	 * buffer to be written before being reused, and setting b_resid
	 * to zero says the buffer is complete.
	 */
	bp->b_flags |= B_DELWRI | B_DONE;
	bp->b_resid = 0;
	brelse(bp);
}
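
/*
 * Sketch (hypothetical caller): bdwrite() is the usual choice when a
 * partial block will likely be written again soon; the physical write
 * is left to fsflush or to a later reuse of the buffer.
 *
 *	bp = bread(dev, blkno, bsize);
 *	// ...modify part of bp->b_un.b_addr...
 *	bdwrite(bp);	// mark B_DELWRI|B_DONE and release; no I/O yet
 */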
/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
void
bawrite(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));

	/* Use bfreelist.b_bcount as a weird-ass heuristic */
	if (bfreelist.b_bcount > 4)
		bp->b_flags |= B_ASYNC;
	BWRITE(bp);
}
/*
 * Release the buffer, with no I/O implied.
 */
void
brelse(struct buf *bp)
{
	struct buf	**backp;
	uint_t		index;
	kmutex_t	*hmp;
	struct	buf	*dp;
	struct	hbuf	*hp;

	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * Clear the retry write flag if the buffer was written without
	 * error.  The presence of B_DELWRI means the buffer has not yet
	 * been written and the presence of B_ERROR means that an error
	 * is still occurring.
	 */
	if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
		bp->b_flags &= ~B_RETRYWRI;
	}

	/* Check for anomalous conditions */
	if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
		if (bp->b_flags & B_NOCACHE) {
			/* Don't add to the freelist. Destroy it now */
			kmem_free(bp->b_un.b_addr, bp->b_bufsize);
			sema_destroy(&bp->b_sem);
			sema_destroy(&bp->b_io);
			kmem_free(bp, sizeof (struct buf));
			return;
		}
		/*
		 * If a write failed and we are supposed to retry write,
		 * don't toss the buffer.  Keep it around and mark it
		 * delayed write in the hopes that it will eventually
		 * get flushed (and still keep the system running.)
		 */
		if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
			bp->b_flags |= B_DELWRI;
			/* keep fsflush from trying continuously to flush */
			bp->b_start = ddi_get_lbolt();
		} else
			bp->b_flags |= B_AGE|B_STALE;
		bp->b_flags &= ~B_ERROR;
		bp->b_error = 0;
	}

	/*
	 * If delayed write is set then put in on the delayed
	 * write list instead of the free buffer list.
	 */
	index = bio_bhash(bp->b_edev, bp->b_blkno);
	hmp   = &hbuf[index].b_lock;

	mutex_enter(hmp);
	hp = &hbuf[index];
	dp = (struct buf *)hp;

	/*
	 * Make sure that the number of entries on this list are
	 * Zero <= count <= total # buffers
	 */
	ASSERT(hp->b_length >= 0);
	ASSERT(hp->b_length < nbuf);

	hp->b_length++;		/* We are adding this buffer */

	if (bp->b_flags & B_DELWRI) {
		/*
		 * This buffer goes on the delayed write buffer list
		 */
		dp = (struct buf *)&dwbuf[index];
	}
	ASSERT(bp->b_bufsize > 0);
	ASSERT(bp->b_bcount > 0);
	ASSERT(bp->b_un.b_addr != NULL);

	if (bp->b_flags & B_AGE) {
		backp = &dp->av_forw;
		(*backp)->av_back = bp;
		bp->av_forw = *backp;
		*backp = bp;
		bp->av_back = dp;
	} else {
		backp = &dp->av_back;
		(*backp)->av_forw = bp;
		bp->av_back = *backp;
		*backp = bp;
		bp->av_forw = dp;
	}
	mutex_exit(hmp);

	if (bfreelist.b_flags & B_WANTED) {
		/*
		 * Should come here very very rarely.
		 */
		mutex_enter(&bfree_lock);
		if (bfreelist.b_flags & B_WANTED) {
			bfreelist.b_flags &= ~B_WANTED;
			cv_broadcast(&bio_mem_cv);
		}
		mutex_exit(&bfree_lock);
	}

	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
	/*
	 * Don't let anyone get the buffer off the freelist before we
	 * release our hold on it.
	 */
	sema_v(&bp->b_sem);
}
/*
 * Return a count of the number of B_BUSY buffers in the system.
 * Can only be used as a good estimate.  If 'cleanit' is set,
 * try to flush all bufs.
 */
int
bio_busy(int cleanit)
{
	struct buf *bp, *dp;
	int busy = 0;
	int i;
	kmutex_t *hmp;

	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_flags & B_BUSY)
				busy++;
		}
		mutex_exit(hmp);
	}

	if (cleanit && busy != 0) {
		bflush(NODEV);
	}

	return (busy);
}
/*
 * this interface is provided for binary compatibility.
 *
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk(dev_t dev, daddr_t blkno, long bsize)
{
	return (getblk_common(/* ufsvfsp */ NULL, dev,
	    blkno, bsize, /* errflg */ 0));
}
/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, int errflg)
{
	ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
	struct buf *bp;
	struct buf *dp;
	struct buf *nbp = NULL;
	struct buf *errbp;
	uint_t		index;
	kmutex_t	*hmp;
	struct	hbuf	*hp;

	if (getmajor(dev) >= devcnt)
		cmn_err(CE_PANIC, "blkdev");

	biostats.bio_lookup.value.ui32++;

	index = bio_bhash(dev, blkno);
	hp    = &hbuf[index];
	dp    = (struct buf *)hp;
	hmp   = &hp->b_lock;

	mutex_enter(hmp);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		/*
		 * Avoid holding the hash lock in the event that
		 * the buffer is locked by someone. Since the hash chain
		 * may change when we drop the hash lock
		 * we have to start at the beginning of the chain if the
		 * buffer identity/contents aren't valid.
		 */
		if (!sema_tryp(&bp->b_sem)) {
			biostats.bio_bufbusy.value.ui32++;
			/*
			 * OK, we are dealing with a busy buffer.
			 * In the case that we are panicking and we
			 * got called from bread(), we have some chance
			 * for error recovery. So better bail out from
			 * here since sema_p() won't block. If we got
			 * called directly from ufs routines, there is
			 * no way to report an error yet.
			 */
			if (panicstr && errflg)
				goto errout;
			/*
			 * For the following line of code to work
			 * correctly never kmem_free the buffer "header".
			 */
			sema_p(&bp->b_sem);
			if (bp->b_blkno != blkno || bp->b_edev != dev ||
			    (bp->b_flags & B_STALE)) {
				sema_v(&bp->b_sem);
				goto loop;	/* start over */
			}
		}

		/* Found */
		biostats.bio_hit.value.ui32++;
		bp->b_flags &= ~B_AGE;

		/*
		 * Yank it off the free/delayed write lists
		 */
		hp->b_length--;
		notavail(bp);
		mutex_exit(hmp);

		ASSERT((bp->b_flags & B_NOCACHE) == 0);

		if (nbp == NULL) {
			/*
			 * Make the common path short.
			 */
			ASSERT(SEMA_HELD(&bp->b_sem));
			return (bp);
		}

		biostats.bio_bufdup.value.ui32++;

		/*
		 * The buffer must have entered during the lock upgrade
		 * so free the new buffer we allocated and return the
		 * found buffer.
		 */
		kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
		nbp->b_un.b_addr = NULL;

		/*
		 * Account for the memory
		 */
		mutex_enter(&bfree_lock);
		bfreelist.b_bufsize += nbp->b_bufsize;
		mutex_exit(&bfree_lock);

		/*
		 * Destroy buf identity, and place on avail list
		 */
		nbp->b_dev = (o_dev_t)NODEV;
		nbp->b_edev = NODEV;
		nbp->b_flags = 0;
		nbp->b_iodone = NULL;

		sema_v(&nbp->b_sem);
		bio_bhdr_free(nbp);

		ASSERT(SEMA_HELD(&bp->b_sem));
		return (bp);
	}

	/*
	 * bio_getfreeblk may block so check the hash chain again.
	 */
	if (nbp == NULL) {
		mutex_exit(hmp);
		nbp = bio_getfreeblk(bsize);
		mutex_enter(hmp);
		goto loop;
	}

	/*
	 * New buffer. Assign nbp and stick it on the hash.
	 */
	nbp->b_flags = B_BUSY;
	nbp->b_edev = dev;
	nbp->b_dev = (o_dev_t)cmpdev(dev);
	nbp->b_blkno = blkno;
	nbp->b_iodone = NULL;
	nbp->b_bcount = bsize;
	/*
	 * If we are given a ufsvfsp and the vfs_root field is NULL
	 * then this must be I/O for a superblock.  A superblock's
	 * buffer is set up in mountfs() and there is no root vnode
	 * at that point.
	 */
	if (ufsvfsp && ufsvfsp->vfs_root) {
		nbp->b_vp = ufsvfsp->vfs_root;
	} else {
		nbp->b_vp = NULL;
	}

	ASSERT((nbp->b_flags & B_NOCACHE) == 0);

	binshash(nbp, dp);
	mutex_exit(hmp);

	ASSERT(SEMA_HELD(&nbp->b_sem));

	return (nbp);


	/*
	 * Come here in case of an internal error. At this point we couldn't
	 * get a buffer, but we have to return one. Hence we allocate some
	 * kind of error reply buffer on the fly. This buffer is marked as
	 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
	 *	- B_ERROR will indicate error to the caller.
	 *	- B_DONE will prevent us from reading the buffer from
	 *	  the device.
	 *	- B_NOCACHE will cause that this buffer gets free'd in
	 *	  brelse().
	 */
errout:
	errbp = geteblk();
	sema_p(&errbp->b_sem);
	errbp->b_flags &= ~B_BUSY;
	errbp->b_flags |= (B_ERROR | B_DONE);
	return (errbp);
}
/*
 * Get an empty block, not assigned to any particular device.
 * Returns a locked buffer that is not on any hash or free list.
 */
struct buf *
ngeteblk(long bsize)
{
	struct buf *bp;

	bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
	bioinit(bp);
	bp->av_forw = bp->av_back = NULL;
	bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
	bp->b_bufsize = bsize;
	bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
	bp->b_dev = (o_dev_t)NODEV;
	bp->b_edev = NODEV;
	bp->b_lblkno = 0;
	bp->b_bcount = bsize;
	bp->b_iodone = NULL;
	return (bp);
}
/*
 * Interface of geteblk() is kept intact to maintain driver compatibility.
 * Use ngeteblk() to allocate block size other than 1 KB.
 */
struct buf *
geteblk(void)
{
	return (ngeteblk((long)1024));
}
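
/*
 * Sketch (hypothetical driver use of ngeteblk()): allocate a private
 * 8K buffer, aim it at a device block, write it out synchronously,
 * then release it.  Because the buffer is B_NOCACHE, brelse() destroys
 * it instead of caching it.
 *
 *	bp = ngeteblk(8192);
 *	bp->b_edev = dev;
 *	bp->b_blkno = blkno;
 *	// ...fill bp->b_un.b_addr...
 *	bp->b_flags |= B_WRITE;
 *	(void) bdev_strategy(bp);
 *	(void) biowait(bp);
 *	brelse(bp);		// frees data and header (B_NOCACHE)
 */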
/*
 * Return a buffer w/o sleeping
 */
struct buf *
trygetblk(dev_t dev, daddr_t blkno)
{
	struct buf	*bp;
	struct buf	*dp;
	struct	hbuf	*hp;
	kmutex_t	*hmp;
	uint_t		index;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	hmp = &hp->b_lock;

	if (!mutex_tryenter(hmp))
		return (NULL);

	dp = (struct buf *)hp;
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		/*
		 * Get access to a valid buffer without sleeping
		 */
		if (sema_tryp(&bp->b_sem)) {
			if (bp->b_flags & B_DONE) {
				hp->b_length--;
				notavail(bp);
				mutex_exit(hmp);
				return (bp);
			} else {
				sema_v(&bp->b_sem);
				break;
			}
		}
		break;
	}
	mutex_exit(hmp);
	return (NULL);
}
/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
int
iowait(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	return (biowait(bp));
}
/*
 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 * and wake up anyone waiting for it.
 */
void
iodone(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	biodone(bp);
}
/*
 * Zero the core associated with a buffer.
 */
void
clrbuf(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	bzero(bp->b_un.b_addr, bp->b_bcount);
	bp->b_resid = 0;
}
/*
 * Make sure all write-behind blocks on dev (or NODEV for all)
 * are flushed out.
 */
void
bflush(dev_t dev)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *delwri_list = EMPTY_LIST;
	int i, index;
	kmutex_t *hmp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any invalidates or flushes ahead of us to finish.
	 * We really could split blist_lock up per device for better
	 * parallelism here.
	 */
	while (bio_doinginval || bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doingflush++;
	/*
	 * Gather all B_DELWRI buffer for device.
	 * Lock ordering is b_sem > hash lock (brelse).
	 * Since we are finding the buffer via the delayed write list,
	 * it may be busy and we would block trying to get the
	 * b_sem lock while holding hash lock. So transfer all the
	 * candidates on the delwri_list and then drop the hash locks.
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		hmp = &hbuf[i].b_lock;
		dp = (struct buf *)&dwbuf[i];
		mutex_enter(hmp);
		for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
			if (dev == NODEV || bp->b_edev == dev) {
				if (bp->b_list == NULL) {
					bp->b_list = delwri_list;
					delwri_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/*
	 * Now that the hash locks have been dropped grab the semaphores
	 * and write back all the buffers that have B_DELWRI set.
	 */
	while (delwri_list != EMPTY_LIST) {
		bp = delwri_list;

		sema_p(&bp->b_sem);	/* may block */
		if ((dev != bp->b_edev && dev != NODEV) ||
		    (panicstr && bp->b_flags & B_BUSY)) {
			sema_v(&bp->b_sem);
			delwri_list = bp->b_list;
			bp->b_list = NULL;
			continue;	/* No longer a candidate */
		}
		if (bp->b_flags & B_DELWRI) {
			index = bio_bhash(bp->b_edev, bp->b_blkno);
			hp = &hbuf[index];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			bp->b_flags |= B_ASYNC;
			mutex_enter(hmp);
			hp->b_length--;
			notavail(bp);
			mutex_exit(hmp);
			if (bp->b_vp == NULL) {		/* !ufs */
				BWRITE(bp);
			} else {			/* ufs */
				UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
			}
		} else {
			sema_v(&bp->b_sem);
		}
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
	mutex_enter(&blist_lock);
	bio_doingflush--;
	if (bio_flinv_cv_wanted) {
		bio_flinv_cv_wanted = 0;
		cv_broadcast(&bio_flushinval_cv);
	}
	mutex_exit(&blist_lock);
}
/*
 * Ensure that a specified block is up-to-date on disk.
 */
void
blkflush(dev_t dev, daddr_t blkno)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *sbp = NULL;
	uint_t index;
	kmutex_t *hmp;

	index = bio_bhash(dev, blkno);
	hp    = &hbuf[index];
	dp    = (struct buf *)hp;
	hmp   = &hp->b_lock;

	/*
	 * Identify the buffer in the cache belonging to
	 * this device and blkno (if any).
	 */
	mutex_enter(hmp);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		sbp = bp;
		break;
	}
	mutex_exit(hmp);
	if (sbp == NULL)
		return;
	/*
	 * Now check the buffer we have identified and
	 * make sure it still belongs to the device and is B_DELWRI
	 */
	sema_p(&sbp->b_sem);
	if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
	    (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
		mutex_enter(hmp);
		hp->b_length--;
		notavail(sbp);
		mutex_exit(hmp);
		/*
		 * XXX - There is nothing to guarantee a synchronous
		 * write here if the B_ASYNC flag is set.  This needs
		 * some investigation.
		 */
		if (sbp->b_vp == NULL) {		/* !ufs */
			BWRITE(sbp);	/* synchronous write */
		} else {			/* ufs */
			UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
		}
	} else {
		sema_v(&sbp->b_sem);
	}
}
/*
 * Same as binval, except can force-invalidate delayed-write buffers
 * (which may not already be flushed because of device errors).  Also
 * makes sure that the retry write flag is cleared.
 */
int
bfinval(dev_t dev, int force)
{
	struct buf *dp;
	struct buf *bp;
	struct buf *binval_list = EMPTY_LIST;
	int i, error = 0;
	kmutex_t *hmp;
	uint_t index;
	struct buf **backp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any flushes ahead of us to finish, it's ok to
	 * do invalidates in parallel.
	 */
	while (bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doinginval++;

	/* Gather bp's */
	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_edev == dev) {
				if (bp->b_list == NULL) {
					bp->b_list = binval_list;
					binval_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/* Invalidate all bp's found */
	while (binval_list != EMPTY_LIST) {
		bp = binval_list;

		sema_p(&bp->b_sem);
		if (bp->b_edev == dev) {
			if (force && (bp->b_flags & B_DELWRI)) {
				/* clear B_DELWRI, move to non-dw freelist */
				index = bio_bhash(bp->b_edev, bp->b_blkno);
				hmp = &hbuf[index].b_lock;
				dp = (struct buf *)&hbuf[index];
				mutex_enter(hmp);

				/* remove from delayed write freelist */
				notavail(bp);

				/* add to B_AGE side of non-dw freelist */
				backp = &dp->av_forw;
				(*backp)->av_back = bp;
				bp->av_forw = *backp;
				*backp = bp;
				bp->av_back = dp;

				/*
				 * make sure write retries and busy are cleared
				 */
				bp->b_flags &=
				    ~(B_BUSY | B_DELWRI | B_RETRYWRI);
				mutex_exit(hmp);
			}
			if ((bp->b_flags & B_DELWRI) == 0)
				bp->b_flags |= B_STALE|B_AGE;
			else
				error = EIO;
		}
		sema_v(&bp->b_sem);
		binval_list = bp->b_list;
		bp->b_list = NULL;
	}
	mutex_enter(&blist_lock);
	bio_doinginval--;
	if (bio_flinv_cv_wanted) {
		cv_broadcast(&bio_flushinval_cv);
		bio_flinv_cv_wanted = 0;
	}
	mutex_exit(&blist_lock);
	return (error);
}
/*
 * If possible, invalidate blocks for a dev on demand
 */
void
binval(dev_t dev)
{
	(void) bfinval(dev, 0);
}
/*
 * Initialize the buffer I/O system by freeing
 * all buffers and setting all device hash buffer lists to empty.
 */
void
binit(void)
{
	struct buf *bp;
	unsigned int i, pct;
	ulong_t	bio_max_hwm, bio_default_hwm;

	/*
	 * Maximum/Default values for bufhwm are set to the smallest of:
	 *	- BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
	 *	- 1/4 of kernel virtual memory
	 *	- INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
	 * Additionally, in order to allow simple tuning by percentage of
	 * physical memory, bufhwm_pct is used to calculate the default if
	 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
	 *
	 * Since the unit for v.v_bufhwm is kilobytes, this allows for
	 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
	 */
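	/*
	 * Worked example (a sketch, not exact for any one machine): with
	 * physmem == 1M pages of 4K (4GB), physmem / BIO_MAX_PERCENT is
	 * 1M / 5 pages, i.e. roughly 800MB or 20% of real memory, so
	 * bio_max_hwm is ~800000 (KB) unless the kernel heap bound is
	 * smaller.  The default divisor BIO_BUF_PERCENT (50) yields
	 * ~80MB, i.e. 2%.
	 */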
	bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
	bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);

	pct = BIO_BUF_PERCENT;
	if (bufhwm_pct != 0 &&
	    ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
		pct = BIO_BUF_PERCENT;
		/*
		 * Invalid user specified value, emit a warning.
		 */
		cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of "
		    "range(1..%d). Using %d as default.", bufhwm_pct,
		    100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
	}

	bio_default_hwm = MIN(physmem / pct,
	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
	bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);

	if ((v.v_bufhwm = bufhwm) == 0)
		v.v_bufhwm = bio_default_hwm;

	if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
		v.v_bufhwm = (int)bio_max_hwm;
		/*
		 * Invalid user specified value, emit a warning.
		 */
		cmn_err(CE_WARN,
		    "binit: bufhwm(%d) out "
		    "of range(%d..%lu). Using %lu as default",
		    bufhwm,
		    BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
	}

	/*
	 * Determine the number of hash buckets. Default is to
	 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
	 * Round up number to the next power of 2.
	 */
	v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
	    BIO_HASHLEN);
	v.v_hmask = v.v_hbuf - 1;
	v.v_buf = BIO_BHDR_POOL;

	hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);

	dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);

	bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;

	bp = &bfreelist;
	bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;

	for (i = 0; i < v.v_hbuf; i++) {
		hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
		hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];

		/*
		 * Initialize the delayed write buffer list.
		 */
		dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
		dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
	}
}
/*
 * Wait for I/O completion on the buffer; return error code.
 * If bp was for synchronous I/O, bp is invalid and associated
 * resources are freed on return.
 */
int
biowait(struct buf *bp)
{
	int error = 0;
	struct cpu *cpup;

	ASSERT(SEMA_HELD(&bp->b_sem));

	cpup = CPU;
	atomic_inc_64(&cpup->cpu_stats.sys.iowait);
	DTRACE_IO1(wait__start, struct buf *, bp);

	/*
	 * In case of panic, busy wait for completion
	 */
	if (panicstr) {
		while ((bp->b_flags & B_DONE) == 0)
			drv_usecwait(10);
	} else
		sema_p(&bp->b_io);

	DTRACE_IO1(wait__done, struct buf *, bp);
	atomic_dec_64(&cpup->cpu_stats.sys.iowait);

	error = geterror(bp);
	if ((bp->b_flags & B_ASYNC) == 0) {
		if (bp->b_flags & B_REMAPPED)
			bp_mapout(bp);
	}
	return (error);
}
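
/*
 * Sketch (hypothetical synchronous caller): start the transfer and
 * block in biowait() until the interrupt path calls biodone(), which
 * does a sema_v(&bp->b_io).
 *
 *	bp->b_flags |= B_READ;
 *	(void) bdev_strategy(bp);
 *	error = biowait(bp);	// sleeps on b_io until biodone()
 */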
/*
 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 * and wake up anyone waiting for it.
 */
void
biodone(struct buf *bp)
{
	if (bp->b_flags & B_STARTED) {
		DTRACE_IO1(done, struct buf *, bp);
		bp->b_flags &= ~B_STARTED;
	}

	if (bp->b_iodone != NULL) {
		(*(bp->b_iodone))(bp);
		return;
	}
	ASSERT((bp->b_flags & B_DONE) == 0);
	ASSERT(SEMA_HELD(&bp->b_sem));
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_ASYNC) {
		if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
			bio_pageio_done(bp);
		else
			brelse(bp);	/* release bp to freelist */
	} else {
		sema_v(&bp->b_io);
	}
}
/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized code.
 */
int
geterror(struct buf *bp)
{
	int error = 0;

	ASSERT(SEMA_HELD(&bp->b_sem));
	if (bp->b_flags & B_ERROR) {
		error = bp->b_error;
		if (!error)
			error = EIO;
	}
	return (error);
}
/*
 * Support for pageio buffers.
 *
 * This stuff should be generalized to provide a generalized bp
 * header facility that can be used for things other than pageio.
 */
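
/*
 * Sketch (hypothetical pager path; names assumed): wrap a page list in
 * a pageio buf, hand it to the device, and unwind on completion.
 *
 *	bp = pageio_setup(pp, len, vp, B_WRITE | flags);
 *	bp->b_edev = dev;
 *	bp->b_blkno = blkno;
 *	bp_mapin(bp);		// only if a kernel mapping is needed
 *	(void) bdev_strategy(bp);
 *	// async: biodone() ends up in bio_pageio_done(); for sync I/O:
 *	error = biowait(bp);
 *	pageio_done(bp);
 */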
/*
 * Allocate and initialize a buf struct for use with pageio.
 */
struct buf *
pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
{
	struct buf *bp;
	struct cpu *cpup;

	if (flags & B_READ) {
		CPU_STATS_ENTER_K();
		cpup = CPU;	/* get pointer AFTER preemption is disabled */
		CPU_STATS_ADDQ(cpup, vm, pgin, 1);
		CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));

		atomic_add_64(&curzone->zone_pgpgin, btopr(len));

		if ((flags & B_ASYNC) == 0) {
			klwp_t *lwp = ttolwp(curthread);
			if (lwp != NULL)
				lwp->lwp_ru.majflt++;
			CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
		}
		/*
		 * Update statistics for pages being paged in
		 */
		if (pp != NULL && pp->p_vnode != NULL) {
			if (IS_SWAPFSVP(pp->p_vnode)) {
				CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
				atomic_add_64(&curzone->zone_anonpgin,
				    btopr(len));
			} else {
				if (pp->p_vnode->v_flag & VVMEXEC) {
					CPU_STATS_ADDQ(cpup, vm, execpgin,
					    btopr(len));
					atomic_add_64(&curzone->zone_execpgin,
					    btopr(len));
				} else {
					CPU_STATS_ADDQ(cpup, vm, fspgin,
					    btopr(len));
					atomic_add_64(&curzone->zone_fspgin,
					    btopr(len));
				}
			}
		}
		CPU_STATS_EXIT_K();
		TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
		    "page_ws_in:pp %p", pp);
	}

	bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
	bp->b_bcount = len;
	bp->b_bufsize = len;
	bp->b_pages = pp;
	bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
	bp->b_offset = -1;
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);

	/* Initialize bp->b_sem in "locked" state */
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);

	VN_HOLD(vp);
	bp->b_vp = vp;
	THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */

	/*
	 * Caller sets dev & blkno and can adjust
	 * b_addr for page offset and can use bp_mapin
	 * to make pages kernel addressable.
	 */
	return (bp);
}
void
pageio_done(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	if (bp->b_flags & B_REMAPPED)
		bp_mapout(bp);

	VN_RELE(bp->b_vp);
	bp->b_vp = NULL;
	ASSERT((bp->b_flags & B_NOCACHE) != 0);

	/* A sema_v(bp->b_sem) is implied if we are destroying it */
	sema_destroy(&bp->b_sem);
	sema_destroy(&bp->b_io);
	kmem_free(bp, sizeof (struct buf));
}
/*
 * Check to see whether the buffers, except the one pointed by sbp,
 * associated with the device are busy.
 * NOTE: This expensive operation shall be improved together with ufs_icheck().
 */
int
bcheck(dev_t dev, struct buf *sbp)
{
	struct buf	*bp;
	struct buf	*dp;
	int i;
	kmutex_t *hmp;

	/*
	 * check for busy bufs for this filesystem
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			/*
			 * if buf is busy or dirty, then filesystem is busy
			 */
			if ((bp->b_edev == dev) &&
			    ((bp->b_flags & B_STALE) == 0) &&
			    (bp->b_flags & (B_DELWRI|B_BUSY)) &&
			    (bp != sbp)) {
				mutex_exit(hmp);
				return (1);
			}
		}
		mutex_exit(hmp);
	}
	return (0);
}
/*
 * Hash two 32 bit entities.
 */
int
hash2ints(int x, int y)
{
	int hash = 0;

	hash = x - 1;
	hash = ((hash * 7) + (x >> 8)) - 1;
	hash = ((hash * 7) + (x >> 16)) - 1;
	hash = ((hash * 7) + (x >> 24)) - 1;
	hash = ((hash * 7) + y) - 1;
	hash = ((hash * 7) + (y >> 8)) - 1;
	hash = ((hash * 7) + (y >> 16)) - 1;
	hash = ((hash * 7) + (y >> 24)) - 1;

	return (hash);
}
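
/*
 * Worked example (a sketch): starting from hash = 0, each step computes
 * hash = hash * 7 + (term) - 1, where the terms are x, x >> 8, x >> 16,
 * x >> 24, then the same series for y.  For x == 0x0102 and y == 0 the
 * first two steps give (0x0102 - 1) == 0x101, then
 * 0x101 * 7 + 0x01 - 1 == 0x707.  bio_bhash() masks the final value
 * with v.v_hmask, which works because v.v_hbuf is a power of two.
 */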
/*
 * Return a new buffer struct.
 *	Create a new buffer if we haven't gone over our high water
 *	mark for memory, otherwise try to get one off the freelist.
 *
 * Returns a locked buf that has no id and is not on any hash or free
 * list.
 */
static struct buf *
bio_getfreeblk(long bsize)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	kmutex_t	*hmp;
	uint_t		start, end;

	/*
	 * bfreelist.b_bufsize (protected by bfree_lock) represents the
	 * amount of memory we are allowed to allocate in the cache
	 * before we hit our hwm.
	 */
	bio_mem_get(bsize);	/* Account for our memory request */

	bp = bio_bhdr_alloc();	/* Get a buf hdr */
	sema_p(&bp->b_sem);	/* Should never fail */

	ASSERT(bp->b_un.b_addr == NULL);
	bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
	if (bp->b_un.b_addr != NULL) {
		/*
		 * Make the common path short
		 */
		bp->b_bufsize = bsize;
		ASSERT(SEMA_HELD(&bp->b_sem));
		return (bp);
	} else {
		struct buf *save;

		save = bp;	/* Save bp we allocated */
		start = end = lastindex;

		biostats.bio_bufwant.value.ui32++;

		/*
		 * Memory isn't available from the system now. Scan
		 * the hash buckets till enough space is found.
		 */
		do {
			hp = &hbuf[start];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			mutex_enter(hmp);
			bp = dp->av_forw;

			while (bp != dp) {
				ASSERT(bp != NULL);

				if (!sema_tryp(&bp->b_sem)) {
					bp = bp->av_forw;
					continue;
				}

				/*
				 * Since we are going down the freelist
				 * associated with this hash bucket the
				 * B_DELWRI flag should not be set.
				 */
				ASSERT(!(bp->b_flags & B_DELWRI));

				if (bp->b_bufsize == bsize) {
					hp->b_length--;
					notavail(bp);
					bremhash(bp);
					mutex_exit(hmp);

					/*
					 * Didn't kmem_alloc any more, so don't
					 * count it twice.
					 */
					mutex_enter(&bfree_lock);
					bfreelist.b_bufsize += bsize;
					mutex_exit(&bfree_lock);

					/*
					 * Update the lastindex value.
					 */
					lastindex = start;

					/*
					 * Put our saved bp back on the list
					 */
					sema_v(&save->b_sem);
					bio_bhdr_free(save);
					ASSERT(SEMA_HELD(&bp->b_sem));
					return (bp);
				}
				sema_v(&bp->b_sem);
				bp = bp->av_forw;
			}
			mutex_exit(hmp);
			start = ((start + 1) % v.v_hbuf);
		} while (start != end);

		biostats.bio_bufwait.value.ui32++;
		bp = save;		/* Use original bp */
		bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
	}

	bp->b_bufsize = bsize;
	ASSERT(SEMA_HELD(&bp->b_sem));
	return (bp);
}
/*
 * Allocate a buffer header. If none currently available, allocate
 * a new pool.
 */
static struct buf *
bio_bhdr_alloc(void)
{
	struct buf *dp, *sdp;
	struct buf *bp;
	int i;

	for (;;) {
		mutex_enter(&bhdr_lock);
		if (bhdrlist != NULL) {
			bp = bhdrlist;
			bhdrlist = bp->av_forw;
			mutex_exit(&bhdr_lock);
			bp->av_forw = NULL;
			return (bp);
		}
		mutex_exit(&bhdr_lock);

		/*
		 * Need to allocate a new pool. If the system is currently
		 * out of memory, then try freeing things on the freelist.
		 */
		dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
		if (dp == NULL) {
			/*
			 * System can't give us a pool of headers, try
			 * recycling from the free lists.
			 */
			bio_recycle(BIO_HEADER, 0);
		} else {
			sdp = dp;
			for (i = 0; i < v.v_buf; i++, dp++) {
				/*
				 * The next two lines are needed since NODEV
				 * is -1 and not NULL
				 */
				dp->b_dev = (o_dev_t)NODEV;
				dp->b_edev = NODEV;
				dp->av_forw = dp + 1;
				sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
				    NULL);
				sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
				    NULL);
				dp->b_offset = -1;
			}
			mutex_enter(&bhdr_lock);
			(--dp)->av_forw = bhdrlist;	/* Fix last pointer */
			bhdrlist = sdp;
			nbuf += v.v_buf;
			bp = bhdrlist;
			bhdrlist = bp->av_forw;
			mutex_exit(&bhdr_lock);

			bp->av_forw = NULL;
			return (bp);
		}
	}
}
static void
bio_bhdr_free(struct buf *bp)
{
	ASSERT(bp->b_back == NULL);
	ASSERT(bp->b_forw == NULL);
	ASSERT(bp->av_back == NULL);
	ASSERT(bp->av_forw == NULL);
	ASSERT(bp->b_un.b_addr == NULL);
	ASSERT(bp->b_dev == (o_dev_t)NODEV);
	ASSERT(bp->b_edev == NODEV);
	ASSERT(bp->b_flags == 0);

	mutex_enter(&bhdr_lock);
	bp->av_forw = bhdrlist;
	bhdrlist = bp;
	mutex_exit(&bhdr_lock);
}
/*
 * If we haven't gone over the high water mark, it's o.k. to
 * allocate more buffer space, otherwise recycle buffers
 * from the freelist until enough memory is free for a bsize request.
 *
 * We account for this memory, even though
 * we don't allocate it here.
 */
static void
bio_mem_get(long bsize)
{
	mutex_enter(&bfree_lock);
	if (bfreelist.b_bufsize > bsize) {
		bfreelist.b_bufsize -= bsize;
		mutex_exit(&bfree_lock);
		return;
	}
	mutex_exit(&bfree_lock);
	bio_recycle(BIO_MEM, bsize);
}
/*
 * flush a list of delayed write buffers.
 * (currently used only by bio_recycle below.)
 */
static void
bio_flushlist(struct buf *delwri_list)
{
	struct buf *bp;

	while (delwri_list != EMPTY_LIST) {
		bp = delwri_list;
		bp->b_flags |= B_AGE | B_ASYNC;
		if (bp->b_vp == NULL) {		/* !ufs */
			BWRITE(bp);
		} else {			/* ufs */
			UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
		}
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
}
/*
 * Start recycling buffers on the freelist for one of 2 reasons:
 *	- we need a buffer header
 *	- we need to free up memory
 * Once started we continue to recycle buffers until the B_AGE
 * buffers are gone.
 */
static void
bio_recycle(int want, long bsize)
{
	struct buf *bp, *dp, *dwp, *nbp;
	struct hbuf *hp;
	int	found = 0;
	kmutex_t	*hmp;
	int		start, end;
	struct buf *delwri_list = EMPTY_LIST;

	/*
	 * Recycle buffers.
	 */
top:
	start = end = lastindex;
	do {
		hp = &hbuf[start];
		hmp = &hp->b_lock;
		dp = (struct buf *)hp;

		mutex_enter(hmp);
		bp = dp->av_forw;

		while (bp != dp) {
			ASSERT(bp != NULL);

			if (!sema_tryp(&bp->b_sem)) {
				bp = bp->av_forw;
				continue;
			}
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */
			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				return;	/* All done */
			}

			ASSERT(MUTEX_HELD(&hp->b_lock));
			ASSERT(!(bp->b_flags & B_DELWRI));
			hp->b_length--;
			notavail(bp);

			/*
			 * Remove bhdr from cache, free up memory,
			 * and add the hdr to the freelist.
			 */
			bremhash(bp);
			mutex_exit(hmp);

			if (bp->b_bufsize) {
				kmem_free(bp->b_un.b_addr, bp->b_bufsize);
				bp->b_un.b_addr = NULL;
				mutex_enter(&bfree_lock);
				bfreelist.b_bufsize += bp->b_bufsize;
				mutex_exit(&bfree_lock);
			}

			bp->b_dev = (o_dev_t)NODEV;
			bp->b_edev = NODEV;
			bp->b_flags = 0;
			sema_v(&bp->b_sem);
			bio_bhdr_free(bp);
			if (want == BIO_HEADER) {
				found = 1;
			} else {
				ASSERT(want == BIO_MEM);
				if (!found && bfreelist.b_bufsize >= bsize) {
					/* Account for the memory we want */
					mutex_enter(&bfree_lock);
					if (bfreelist.b_bufsize >= bsize) {
						bfreelist.b_bufsize -= bsize;
						found = 1;
					}
					mutex_exit(&bfree_lock);
				}
			}

			/*
			 * Since we dropped hmp start from the
			 * begining.
			 */
			mutex_enter(hmp);
			bp = dp->av_forw;
		}
		mutex_exit(hmp);

		/*
		 * Look at the delayed write list.
		 * First gather into a private list, then write them.
		 */
		dwp = (struct buf *)&dwbuf[start];
		mutex_enter(&blist_lock);
		bio_doingflush++;
		mutex_enter(hmp);
		for (bp = dwp->av_forw; bp != dwp; bp = nbp) {

			ASSERT(bp != NULL);
			nbp = bp->av_forw;

			if (!sema_tryp(&bp->b_sem))
				continue;
			ASSERT(bp->b_flags & B_DELWRI);
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */
			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				mutex_exit(&blist_lock);
				bio_flushlist(delwri_list);
				mutex_enter(&blist_lock);
				bio_doingflush--;
				if (bio_flinv_cv_wanted) {
					bio_flinv_cv_wanted = 0;
					cv_broadcast(&bio_flushinval_cv);
				}
				mutex_exit(&blist_lock);
				return; /* All done */
			}

			/*
			 * If the buffer is already on a flush or
			 * invalidate list then just skip it.
			 */
			if (bp->b_list != NULL) {
				sema_v(&bp->b_sem);
				continue;
			}
			/*
			 * We are still on the same bucket.
			 */
			hp->b_length--;
			notavail(bp);
			bp->b_list = delwri_list;
			delwri_list = bp;
		}
		mutex_exit(hmp);
		mutex_exit(&blist_lock);
		bio_flushlist(delwri_list);
		delwri_list = EMPTY_LIST;
		mutex_enter(&blist_lock);
		bio_doingflush--;
		if (bio_flinv_cv_wanted) {
			bio_flinv_cv_wanted = 0;
			cv_broadcast(&bio_flushinval_cv);
		}
		mutex_exit(&blist_lock);
		start = (start + 1) % v.v_hbuf;

	} while (start != end);

	if (found)
		return;

	/*
	 * Free lists exhausted and we haven't satisfied the request.
	 * Wait here for more entries to be added to freelist.
	 * Because this might have just happened, make it timed.
	 */
	mutex_enter(&bfree_lock);
	bfreelist.b_flags |= B_WANTED;
	(void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
	mutex_exit(&bfree_lock);
	goto top;
}
/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
 */
static int
bio_incore(dev_t dev, daddr_t blkno)
{
	struct buf *bp;
	struct buf *dp;
	uint_t index;
	kmutex_t *hmp;

	index = bio_bhash(dev, blkno);
	dp = (struct buf *)&hbuf[index];
	hmp = &hbuf[index].b_lock;

	mutex_enter(hmp);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno == blkno && bp->b_edev == dev &&
		    (bp->b_flags & B_STALE) == 0) {
			mutex_exit(hmp);
			return (1);
		}
	}
	mutex_exit(hmp);
	return (0);
}
static void
bio_pageio_done(struct buf *bp)
{
	if (bp->b_flags & B_PAGEIO) {

		if (bp->b_flags & B_REMAPPED)
			bp_mapout(bp);

		if (bp->b_flags & B_READ)
			pvn_read_done(bp->b_pages, bp->b_flags);
		else
			pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
		pageio_done(bp);
	} else {
		ASSERT(bp->b_flags & B_REMAPPED);
		bp_mapout(bp);
		brelse(bp);
	}
}
/*
 * bioerror(9F) - indicate error in buffer header
 * If 'error' is zero, remove the error indication.
 */
void
bioerror(struct buf *bp, int error)
{
	ASSERT(bp != NULL);
	ASSERT(error >= 0);
	ASSERT(SEMA_HELD(&bp->b_sem));

	if (error != 0) {
		bp->b_flags |= B_ERROR;
	} else {
		bp->b_flags &= ~B_ERROR;
	}
	bp->b_error = error;
}
/*
 * bioreset(9F) - reuse a private buffer header after I/O is complete
 */
void
bioreset(struct buf *bp)
{
	ASSERT(bp != NULL);

	biofini(bp);
	bioinit(bp);
}
/*
 * biosize(9F) - return size of a buffer header
 */
size_t
biosize(void)
{
	return (sizeof (struct buf));
}
/*
 * biomodified(9F) - check if buffer is modified
 */
int
biomodified(struct buf *bp)
{
	int npf;
	int ppattr;
	struct page *pp;

	ASSERT(bp != NULL);

	if ((bp->b_flags & B_PAGEIO) == 0) {
		return (-1);
	}
	pp = bp->b_pages;
	npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));

	while (npf > 0) {
		ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
		    HAT_SYNC_STOPON_MOD);
		if (ppattr & P_MOD)
			return (1);
		pp = pp->p_next;
		npf--;
	}

	return (0);
}
/*
 * bioinit(9F) - initialize a buffer structure
 */
void
bioinit(struct buf *bp)
{
	bzero(bp, sizeof (struct buf));
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
	bp->b_offset = -1;
}
/*
 * biofini(9F) - uninitialize a buffer structure
 */
void
biofini(struct buf *bp)
{
	sema_destroy(&bp->b_io);
	sema_destroy(&bp->b_sem);
}
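
/*
 * Sketch (hypothetical driver-private buf): the DDI lifecycle pairs
 * bioinit()/biofini() around any number of bioreset() reuses.
 *
 *	struct buf *bp = kmem_alloc(biosize(), KM_SLEEP);
 *
 *	bioinit(bp);
 *	// ...set b_edev, b_blkno, b_bcount, b_un.b_addr; do the I/O...
 *	bioreset(bp);		// reuse the header for a second transfer
 *	// ...second transfer...
 *	biofini(bp);
 *	kmem_free(bp, biosize());
 */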
/*
 * bioclone(9F) - clone a buffer
 */
struct buf *
bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
    int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
{
	struct buf *bufp;

	ASSERT(bp);
	if (bp_mem == NULL) {
		bufp = kmem_alloc(sizeof (struct buf), sleep);
		if (bufp == NULL) {
			return (NULL);
		}
		bioinit(bufp);
	} else {
		bufp = bp_mem;
		bioreset(bufp);
	}

#define	BUF_CLONE_FLAGS	(B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
	B_ABRWRITE)

	/*
	 * The cloned buffer does not inherit the B_REMAPPED flag.
	 */
	bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS) | B_BUSY;
	bufp->b_bcount = len;
	bufp->b_blkno = blkno;
	bufp->b_iodone = iodone;
	bufp->b_proc = bp->b_proc;
	bufp->b_edev = dev;
	bufp->b_file = bp->b_file;
	bufp->b_offset = bp->b_offset;

	if (bp->b_flags & B_SHADOW) {
		ASSERT(bp->b_shadow);
		ASSERT(bp->b_flags & B_PHYS);

		bufp->b_shadow = bp->b_shadow +
		    btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
		bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
		if (bp->b_flags & B_REMAPPED)
			bufp->b_proc = NULL;
	} else {
		if (bp->b_flags & B_PAGEIO) {
			struct page *pp;
			off_t o;
			int i;

			pp = bp->b_pages;
			o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
			for (i = btop(o); i > 0; i--) {
				pp = pp->p_next;
			}
			bufp->b_pages = pp;
			bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
		} else {
			bufp->b_un.b_addr =
			    (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
			if (bp->b_flags & B_REMAPPED)
				bufp->b_proc = NULL;
		}
	}
	return (bufp);
}