/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2011 Joyent, Inc.  All rights reserved.
 */
/*
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/sysmacros.h>
#include <sys/conf.h>
#include <sys/cpuvar.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/buf.h>
#include <sys/var.h>
#include <sys/vnode.h>
#include <sys/bitmap.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/atomic.h>
#include <vm/seg_kmem.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <sys/vtrace.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_log.h>
#include <sys/systm.h>
#include <sys/vfs.h>
#include <sys/sdt.h>

/* Locks */
static	kmutex_t	blist_lock;	/* protects b_list */
static	kmutex_t	bhdr_lock;	/* protects the bhdrlist */
static	kmutex_t	bfree_lock;	/* protects the bfreelist structure */

struct hbuf	*hbuf;			/* Hash buckets */
struct dwbuf	*dwbuf;			/* Delayed write buckets */
static struct buf *bhdrlist;		/* buf header free list */
static int	nbuf;			/* number of buffer headers allocated */

static int	lastindex;		/* Reference point on where to start */
					/* when looking for free buffers */

#define	bio_bhash(dev, bn)	(hash2ints((dev), (int)(bn)) & v.v_hmask)
#define	EMPTY_LIST	((struct buf *)-1)
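
/*
 * Illustrative note (not in the original source): bio_bhash() folds the
 * (dev, blkno) pair into a hash-bucket index, so the lookup idiom used
 * throughout this file looks like:
 *
 *	index = bio_bhash(dev, blkno);
 *	hp = &hbuf[index];
 *	dp = (struct buf *)hp;
 */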

static kcondvar_t	bio_mem_cv;	/* Condition variables */
static kcondvar_t	bio_flushinval_cv;
static int	bio_doingflush;		/* flush in progress */
static int	bio_doinginval;		/* inval in progress */
static int	bio_flinv_cv_wanted;	/* someone waiting for cv */

/*
 * Statistics on the buffer cache
 */
struct biostats biostats = {
	{ "buffer_cache_lookups",	KSTAT_DATA_UINT32 },
	{ "buffer_cache_hits",		KSTAT_DATA_UINT32 },
	{ "new_buffer_requests",	KSTAT_DATA_UINT32 },
	{ "waits_for_buffer_allocs",	KSTAT_DATA_UINT32 },
	{ "buffers_locked_by_someone",	KSTAT_DATA_UINT32 },
	{ "duplicate_buffers_found",	KSTAT_DATA_UINT32 }
};

kstat_named_t	*biostats_ptr = (kstat_named_t *)&biostats;
uint_t	biostats_ndata = (uint_t)(sizeof (biostats) /
    sizeof (kstat_named_t));

/*
 * Statistics on ufs buffer cache
 * Not protected by locks
 */
struct ufsbiostats ub = {
	{ "breads",			KSTAT_DATA_UINT32 },
	{ "bwrites",			KSTAT_DATA_UINT32 },
	{ "fbiwrites",			KSTAT_DATA_UINT32 },
	{ "getpages",			KSTAT_DATA_UINT32 },
	{ "getras",			KSTAT_DATA_UINT32 },
	{ "putsyncs",			KSTAT_DATA_UINT32 },
	{ "putasyncs",			KSTAT_DATA_UINT32 },
	{ "putpageios",			KSTAT_DATA_UINT32 },
};
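
/*
 * Illustrative note (assumption, not from this file): the biostats table
 * is exported as a named kstat, so the counters can be inspected from
 * userland, e.g. with "kstat -n biostats"; the ufs table is wired up
 * similarly by the ufs module.
 */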

/*
 * more UFS Logging eccentricities...
 *
 * required since "#pragma weak ..." doesn't work in reverse order.
 * i.e.:  genunix (bio.c) is loaded before the ufs modules and pointers
 *        to ufs routines don't get plugged into bio.c calls so
 *        we initialize it when setting up the "lufsops" table
 *        in "lufs.c:_init()"
 */
void (*bio_lufs_strategy)(void *, buf_t *);
void (*bio_snapshot_strategy)(void *, buf_t *);
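
/*
 * Illustrative sketch (assumption; the exact code lives in lufs.c): the
 * logging module patches these hooks from its _init() routine, roughly:
 *
 *	extern void (*bio_lufs_strategy)(void *, buf_t *);
 *
 *	int
 *	_init(void)
 *	{
 *		...
 *		bio_lufs_strategy = lufs_strategy;
 *		...
 *	}
 */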

/* Private routines */
static struct buf	*bio_getfreeblk(long);
static void		bio_mem_get(long);
static void		bio_bhdr_free(struct buf *);
static struct buf	*bio_bhdr_alloc(void);
static void		bio_recycle(int, long);
static void		bio_pageio_done(struct buf *);
static int		bio_incore(dev_t, daddr_t);

/*
 * Buffer cache constants
 */
#define	BIO_BUF_PERCENT	(100/2)		/* default: 2% of memory */
#define	BIO_MAX_PERCENT	(100/20)	/* max is 20% of real memory */
#define	BIO_BHDR_POOL	100		/* Default bhdr pool size */
#define	BIO_MIN_HDR	10		/* Minimum number of buffer headers */
#define	BIO_MIN_HWM	(BIO_MIN_HDR * MAXBSIZE / 1024)
#define	BIO_HASHLEN	4		/* Target length of hash chains */

/* Flags for bio_recycle() */
#define	BIO_HEADER	0x01
#define	BIO_MEM		0x02

extern	int bufhwm;		/* User tunable - high water mark for mem */
extern	int bufhwm_pct;		/* ditto - given in % of physmem */

/*
 * The following routines allocate and free
 * buffers with various side effects.  In general the
 * arguments to an allocate routine are a device and
 * a block number, and the value is a pointer to
 * the buffer header; the buffer returned is locked with a
 * binary semaphore so that no one else can touch it.  If the block was
 * already in core, no I/O need be done; if it is
 * already locked, the process waits until it becomes free.
 * The following routines allocate a buffer:
 *	getblk
 *	bread/BREAD
 *	breada
 * Eventually the buffer must be released, possibly with the
 * side effect of writing it out, by using one of
 *	bwrite/BWRITE/brwrite
 *	bdwrite/BDWRITE
 *	bawrite
 *	brelse
 *
 * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
 * Instead, a binary semaphore, b_sem, is used to gain exclusive access to
 * a buffer, and a binary semaphore, b_io, is used for I/O synchronization.
 * B_DONE is still used to denote a buffer with I/O complete on it.
 *
 * The bfreelist.b_bcount field is computed every time fsflush runs.  It is
 * only an estimate and should not be used where a very accurate count of
 * the free buffers is needed.
 */
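
/*
 * Illustrative sketch (assumption, mirroring how ufs consumes this API):
 * a caller reads a block, checks for errors, and releases the buffer.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	bp = bread(dev, blkno, bsize);
 *	if (bp->b_flags & B_ERROR) {
 *		error = geterror(bp);
 *		brelse(bp);
 *		return (error);
 *	}
 *	... use bp->b_un.b_addr ...
 *	brelse(bp);
 */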

/*
 * Common code for reading a buffer with various options
 *
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
{
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
	struct buf *bp;
	klwp_t *lwp = ttolwp(curthread);

	CPU_STATS_ADD_K(sys, lread, 1);
	bp = getblk_common(ufsvfsp, dev, blkno, bsize, true);
	if (bp->b_flags & B_DONE)
		return (bp);
	bp->b_flags |= B_READ;
	ASSERT(bp->b_bcount == bsize);
	if (ufsvfsp == NULL) {					/* !ufs */
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
							/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
							/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
		ub.ub_breads.value.ul++;		/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (lwp != NULL)
		lwp->lwp_ru.inblock++;
	CPU_STATS_ADD_K(sys, bread, 1);
	(void) biowait(bp);
	return (bp);
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
{
	struct buf *bp, *rabp;
	klwp_t *lwp = ttolwp(curthread);

	bp = NULL;
	if (!bio_incore(dev, blkno)) {
		CPU_STATS_ADD_K(sys, lread, 1);
		bp = getblk(dev, blkno, bsize);
		if ((bp->b_flags & B_DONE) == 0) {
			bp->b_flags |= B_READ;
			bp->b_bcount = bsize;
			(void) bdev_strategy(bp);
			if (lwp != NULL)
				lwp->lwp_ru.inblock++;
			CPU_STATS_ADD_K(sys, bread, 1);
		}
	}
	if (rablkno && bfreelist.b_bcount > 1 &&
	    !bio_incore(dev, rablkno)) {
		rabp = getblk(dev, rablkno, bsize);
		if (rabp->b_flags & B_DONE)
			brelse(rabp);
		else {
			rabp->b_flags |= B_READ|B_ASYNC;
			rabp->b_bcount = bsize;
			(void) bdev_strategy(rabp);
			if (lwp != NULL)
				lwp->lwp_ru.inblock++;
			CPU_STATS_ADD_K(sys, bread, 1);
		}
	}
	if (bp == NULL)
		return (bread(dev, blkno, bsize));
	(void) biowait(bp);
	return (bp);
}

/*
 * Common code for writing a buffer with various options.
 *
 * force_wait  - wait for write completion regardless of B_ASYNC flag
 * do_relse    - release the buffer when we are done
 * clear_flags - flags to clear from the buffer
 */
void
bwrite_common(void *arg, struct buf *bp, bool force_wait, bool do_relse,
    int clear_flags)
{
	register int do_wait;
	struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
	int flag;
	klwp_t *lwp = ttolwp(curthread);
	struct cpu *cpup;

	ASSERT(SEMA_HELD(&bp->b_sem));
	flag = bp->b_flags;
	bp->b_flags &= ~clear_flags;
	if (lwp != NULL)
		lwp->lwp_ru.oublock++;
	CPU_STATS_ENTER_K();
	cpup = CPU;		/* get pointer AFTER preemption is disabled */
	CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
	CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
	do_wait = ((flag & B_ASYNC) == 0 || force_wait);
	if (do_wait == 0)
		CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
	CPU_STATS_EXIT_K();
	if (ufsvfsp == NULL) {
		(void) bdev_strategy(bp);
	} else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
							/* ufs && logging */
		(*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
	} else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
							/* ufs && snapshots */
		(*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
	} else {
		ub.ub_bwrites.value.ul++;		/* ufs && !logging */
		(void) bdev_strategy(bp);
	}
	if (do_wait) {
		(void) biowait(bp);
		if (do_relse)
			brelse(bp);
	}
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * Also save the time that the block is first marked as delayed
 * so that it will be written in a reasonable time.
 */
void
bdwrite(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	CPU_STATS_ADD_K(sys, lwrite, 1);
	if ((bp->b_flags & B_DELWRI) == 0)
		bp->b_start = ddi_get_lbolt();
	/*
	 * B_DONE allows others to use the buffer, B_DELWRI causes the
	 * buffer to be written before being reused, and setting b_resid
	 * to zero says the buffer is complete.
	 */
	bp->b_flags |= B_DELWRI | B_DONE;
	bp->b_resid = 0;
	brelse(bp);
}
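
/*
 * Illustrative sketch (assumption, not from this file): the partial-block
 * pattern described above looks roughly like this in a filesystem write
 * path -- read, modify in core, then delay the write:
 *
 *	bp = bread(dev, blkno, bsize);
 *	bcopy(data, bp->b_un.b_addr + off, len);
 *	bdwrite(bp);
 */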

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
void
bawrite(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));

	/* Use bfreelist.b_bcount as a weird-ass heuristic */
	if (bfreelist.b_bcount > 4)
		bp->b_flags |= B_ASYNC;
	bwrite(bp);
}

/*
 * Release the buffer, with no I/O implied.
 */
void
brelse(struct buf *bp)
{
	struct buf	**backp;
	uint_t		index;
	kmutex_t	*hmp;
	struct	buf	*dp;
	struct	hbuf	*hp;

	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * Clear the retry write flag if the buffer was written without
	 * error.  The presence of B_DELWRI means the buffer has not yet
	 * been written and the presence of B_ERROR means that an error
	 * is still occurring.
	 */
	if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
		bp->b_flags &= ~B_RETRYWRI;
	}

	/* Check for anomalous conditions */
	if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
		if (bp->b_flags & B_NOCACHE) {
			/* Don't add to the freelist. Destroy it now */
			kmem_free(bp->b_un.b_addr, bp->b_bufsize);
			sema_destroy(&bp->b_sem);
			sema_destroy(&bp->b_io);
			kmem_free(bp, sizeof (struct buf));
			return;
		}
		/*
		 * If a write failed and we are supposed to retry write,
		 * don't toss the buffer.  Keep it around and mark it
		 * delayed write in the hopes that it will eventually
		 * get flushed (and still keep the system running.)
		 */
		if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
			bp->b_flags |= B_DELWRI;
			/* keep fsflush from trying continuously to flush */
			bp->b_start = ddi_get_lbolt();
		} else
			bp->b_flags |= B_AGE|B_STALE;
		bp->b_flags &= ~B_ERROR;
		bp->b_error = 0;
	}

	/*
	 * If delayed write is set then put in on the delayed
	 * write list instead of the free buffer list.
	 */
	index = bio_bhash(bp->b_edev, bp->b_blkno);
	hmp = &hbuf[index].b_lock;

	mutex_enter(hmp);
	hp = &hbuf[index];
	dp = (struct buf *)hp;

	/*
	 * Make sure that the number of entries on this list are
	 * Zero <= count <= total # buffers
	 */
	ASSERT(hp->b_length >= 0);
	ASSERT(hp->b_length < nbuf);

	hp->b_length++;		/* We are adding this buffer */

	if (bp->b_flags & B_DELWRI) {
		/*
		 * This buffer goes on the delayed write buffer list
		 */
		dp = (struct buf *)&dwbuf[index];
	}
	ASSERT(bp->b_bufsize > 0);
	ASSERT(bp->b_bcount > 0);
	ASSERT(bp->b_un.b_addr != NULL);

	if (bp->b_flags & B_AGE) {
		backp = &dp->av_forw;
		(*backp)->av_back = bp;
		bp->av_forw = *backp;
		*backp = bp;
		bp->av_back = dp;
	} else {
		backp = &dp->av_back;
		(*backp)->av_forw = bp;
		bp->av_back = *backp;
		*backp = bp;
		bp->av_forw = dp;
	}
	mutex_exit(hmp);

	if (bfreelist.b_flags & B_WANTED) {
		/*
		 * Should come here very very rarely.
		 */
		mutex_enter(&bfree_lock);
		if (bfreelist.b_flags & B_WANTED) {
			bfreelist.b_flags &= ~B_WANTED;
			cv_broadcast(&bio_mem_cv);
		}
		mutex_exit(&bfree_lock);
	}

	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
	/*
	 * Don't let anyone get the buffer off the freelist before we
	 * release our hold on it.
	 */
	sema_v(&bp->b_sem);
}

/*
 * Return a count of the number of B_BUSY buffers in the system.
 * Can only be used as a good estimate.  If 'cleanit' is set,
 * try to flush all bufs.
 */
int
bio_busy(int cleanit)
{
	struct buf *bp, *dp;
	int busy = 0;
	int i;
	kmutex_t *hmp;

	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_flags & B_BUSY)
				busy++;
		}
		mutex_exit(hmp);
	}

	if (cleanit && busy != 0) {
		bflush(NODEV);
	}

	return (busy);
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 */
struct buf *
getblk_common(void *arg, dev_t dev, daddr_t blkno, long bsize, bool errflg)
{
	ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
	struct buf *bp;
	struct buf *dp;
	struct buf *nbp = NULL;
	struct buf *errbp;
	uint_t		index;
	kmutex_t	*hmp;
	struct	hbuf	*hp;

	if (getmajor(dev) >= devcnt)
		cmn_err(CE_PANIC, "blkdev");

	biostats.bio_lookup.value.ui32++;

	index = bio_bhash(dev, blkno);
	hp    = &hbuf[index];
	dp    = (struct buf *)hp;
	hmp   = &hp->b_lock;

	mutex_enter(hmp);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		/*
		 * Avoid holding the hash lock in the event that
		 * the buffer is locked by someone. Since the hash chain
		 * may change when we drop the hash lock
		 * we have to start at the beginning of the chain if the
		 * buffer identity/contents aren't valid.
		 */
		if (!sema_tryp(&bp->b_sem)) {
			biostats.bio_bufbusy.value.ui32++;
			mutex_exit(hmp);
			/*
			 * OK, we are dealing with a busy buffer.
			 * In the case that we are panicking and we
			 * got called from bread(), we have some chance
			 * for error recovery. So better bail out from
			 * here since sema_p() won't block. If we got
			 * called directly from ufs routines, there is
			 * no way to report an error yet.
			 */
			if (panicstr && errflg)
				goto errout;
			/*
			 * For the following line of code to work
			 * correctly never kmem_free the buffer "header".
			 */
			sema_p(&bp->b_sem);
			if (bp->b_blkno != blkno || bp->b_edev != dev ||
			    (bp->b_flags & B_STALE)) {
				sema_v(&bp->b_sem);
				mutex_enter(hmp);
				goto loop;	/* start over */
			}
			mutex_enter(hmp);
		}
		/* Found */
		biostats.bio_hit.value.ui32++;
		bp->b_flags &= ~B_AGE;

		/*
		 * Yank it off the free/delayed write lists
		 */
		hp->b_length--;
		notavail(bp);
		mutex_exit(hmp);

		ASSERT((bp->b_flags & B_NOCACHE) == 0);

		if (nbp == NULL) {
			/*
			 * Make the common path short.
			 */
			ASSERT(SEMA_HELD(&bp->b_sem));
			return (bp);
		}

		biostats.bio_bufdup.value.ui32++;

		/*
		 * The buffer must have entered during the lock upgrade
		 * so free the new buffer we allocated and return the
		 * found buffer.
		 */
		kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
		nbp->b_un.b_addr = NULL;

		/*
		 * Account for the memory
		 */
		mutex_enter(&bfree_lock);
		bfreelist.b_bufsize += nbp->b_bufsize;
		mutex_exit(&bfree_lock);

		/*
		 * Destroy buf identity, and place on avail list
		 */
		nbp->b_dev = (o_dev_t)NODEV;
		nbp->b_edev = NODEV;
		nbp->b_flags = 0;
		sema_v(&nbp->b_sem);
		bio_bhdr_free(nbp);

		ASSERT(SEMA_HELD(&bp->b_sem));
		return (bp);
	}

	/*
	 * bio_getfreeblk may block so check the hash chain again.
	 */
	if (nbp == NULL) {
		mutex_exit(hmp);
		nbp = bio_getfreeblk(bsize);
		mutex_enter(hmp);
		goto loop;
	}

	/*
	 * New buffer. Assign nbp and stick it on the hash.
	 */
	nbp->b_flags = B_BUSY;
	nbp->b_edev = dev;
	nbp->b_dev = (o_dev_t)cmpdev(dev);
	nbp->b_blkno = blkno;
	nbp->b_iodone = NULL;
	nbp->b_bcount = bsize;
	/*
	 * If we are given a ufsvfsp and the vfs_root field is NULL
	 * then this must be I/O for a superblock.  A superblock's
	 * buffer is set up in mountfs() and there is no root vnode
	 * at that point.
	 */
	if (ufsvfsp && ufsvfsp->vfs_root) {
		nbp->b_vp = ufsvfsp->vfs_root;
	} else {
		nbp->b_vp = NULL;
	}

	ASSERT((nbp->b_flags & B_NOCACHE) == 0);

	binshash(nbp, dp);
	mutex_exit(hmp);

	ASSERT(SEMA_HELD(&nbp->b_sem));

	return (nbp);

	/*
	 * Come here in case of an internal error. At this point we couldn't
	 * get a buffer, but we have to return one. Hence we allocate some
	 * kind of error reply buffer on the fly. This buffer is marked as
	 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
	 *	- B_ERROR will indicate error to the caller.
	 *	- B_DONE will prevent us from reading the buffer from
	 *	  the device.
	 *	- B_NOCACHE will cause that this buffer gets free'd in
	 *	  brelse().
	 */
errout:
	errbp = geteblk();
	sema_p(&errbp->b_sem);
	errbp->b_flags &= ~B_BUSY;
	errbp->b_flags |= (B_ERROR | B_DONE);
	return (errbp);
}

/*
 * Get an empty block, not assigned to any particular device.
 * Returns a locked buffer that is not on any hash or free list.
 */
struct buf *
ngeteblk(long bsize)
{
	struct buf *bp;

	bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
	bioinit(bp);
	bp->av_forw = bp->av_back = NULL;
	bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
	bp->b_bufsize = bsize;
	bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
	bp->b_dev = (o_dev_t)NODEV;
	bp->b_edev = NODEV;
	bp->b_lblkno = 0;
	bp->b_bcount = bsize;
	bp->b_iodone = NULL;
	return (bp);
}

/*
 * Interface of geteblk() is kept intact to maintain driver compatibility.
 * Use ngeteblk() to allocate block size other than 1 KB.
 */
struct buf *
geteblk(void)
{
	return (ngeteblk((long)1024));
}
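
/*
 * Illustrative sketch (assumption): a driver that needs a scratch buffer
 * larger than the 1 KB geteblk() default would do something like:
 *
 *	struct buf *bp = ngeteblk(8192);
 *	... fill bp->b_un.b_addr, set b_edev/b_blkno, issue I/O ...
 *	brelse(bp);	(B_NOCACHE makes brelse() destroy it)
 */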

/*
 * Return a buffer w/o sleeping
 */
struct buf *
trygetblk(dev_t dev, daddr_t blkno)
{
	struct buf	*bp;
	struct buf	*dp;
	struct  hbuf	*hp;
	kmutex_t	*hmp;
	uint_t		index;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	hmp = &hp->b_lock;

	if (!mutex_tryenter(hmp))
		return (NULL);

	dp = (struct buf *)hp;
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		/*
		 * Get access to a valid buffer without sleeping
		 */
		if (sema_tryp(&bp->b_sem)) {
			if (bp->b_flags & B_DONE) {
				hp->b_length--;
				notavail(bp);
				mutex_exit(hmp);
				return (bp);
			} else {
				sema_v(&bp->b_sem);
				break;
			}
		}
		break;
	}
	mutex_exit(hmp);
	return (NULL);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
int
iowait(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	return (biowait(bp));
}

/*
 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 * and wake up anyone waiting for it.
 */
void
iodone(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	biodone(bp);
}

/*
 * Zero the core associated with a buffer.
 */
void
clrbuf(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	bzero(bp->b_un.b_addr, bp->b_bcount);
	bp->b_resid = 0;
}

/*
 * Make sure all write-behind blocks on dev (or NODEV for all)
 * are flushed out.
 */
void
bflush(dev_t dev)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *delwri_list = EMPTY_LIST;
	int i, index;
	kmutex_t *hmp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any invalidates or flushes ahead of us to finish.
	 * We really could split blist_lock up per device for better
	 * parallelism here.
	 */
	while (bio_doinginval || bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doingflush++;
	/*
	 * Gather all B_DELWRI buffer for device.
	 * Lock ordering is b_sem > hash lock (brelse).
	 * Since we are finding the buffer via the delayed write list,
	 * it may be busy and we would block trying to get the
	 * b_sem lock while holding hash lock. So transfer all the
	 * candidates on the delwri_list and then drop the hash locks.
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		hmp = &hbuf[i].b_lock;
		dp = (struct buf *)&dwbuf[i];
		mutex_enter(hmp);
		for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
			if (dev == NODEV || bp->b_edev == dev) {
				if (bp->b_list == NULL) {
					bp->b_list = delwri_list;
					delwri_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/*
	 * Now that the hash locks have been dropped grab the semaphores
	 * and write back all the buffers that have B_DELWRI set.
	 */
	while (delwri_list != EMPTY_LIST) {
		bp = delwri_list;

		sema_p(&bp->b_sem);	/* may block */
		if ((dev != bp->b_edev && dev != NODEV) ||
		    (panicstr && bp->b_flags & B_BUSY)) {
			sema_v(&bp->b_sem);
			delwri_list = bp->b_list;
			bp->b_list = NULL;
			continue;	/* No longer a candidate */
		}
		if (bp->b_flags & B_DELWRI) {
			index = bio_bhash(bp->b_edev, bp->b_blkno);
			hp = &hbuf[index];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			bp->b_flags |= B_ASYNC;
			mutex_enter(hmp);
			hp->b_length--;
			notavail(bp);
			mutex_exit(hmp);
			if (bp->b_vp == NULL) {		/* !ufs */
				BWRITE(bp);
			} else {			/* ufs */
				UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
			}
		} else {
			sema_v(&bp->b_sem);
		}
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
	mutex_enter(&blist_lock);
	bio_doingflush--;
	if (bio_flinv_cv_wanted) {
		bio_flinv_cv_wanted = 0;
		cv_broadcast(&bio_flushinval_cv);
	}
	mutex_exit(&blist_lock);
}

/*
 * Ensure that a specified block is up-to-date on disk.
 */
void
blkflush(dev_t dev, daddr_t blkno)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	struct buf *sbp = NULL;
	uint_t index;
	kmutex_t *hmp;

	index = bio_bhash(dev, blkno);
	hp = &hbuf[index];
	dp = (struct buf *)hp;
	hmp = &hp->b_lock;

	/*
	 * Identify the buffer in the cache belonging to
	 * this device and blkno (if any).
	 */
	mutex_enter(hmp);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_edev != dev ||
		    (bp->b_flags & B_STALE))
			continue;
		sbp = bp;
		break;
	}
	mutex_exit(hmp);
	if (sbp == NULL)
		return;
	/*
	 * Now check the buffer we have identified and
	 * make sure it still belongs to the device and is B_DELWRI
	 */
	sema_p(&sbp->b_sem);
	if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
	    (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
		mutex_enter(hmp);
		hp->b_length--;
		notavail(sbp);
		mutex_exit(hmp);
		/*
		 * XXX - There is nothing to guarantee a synchronous
		 * write here if the B_ASYNC flag is set.  This needs
		 * some investigation.
		 */
		if (sbp->b_vp == NULL) {		/* !ufs */
			bwrite(sbp);	/* synchronous write */
		} else {				/* ufs */
			UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
		}
	} else {
		sema_v(&sbp->b_sem);
	}
}

/*
 * Same as binval, except can force-invalidate delayed-write buffers
 * (which are not already flushed because of device errors).  Also
 * makes sure that the retry write flag is cleared.
 */
int
bfinval(dev_t dev, int force)
{
	struct buf *dp;
	struct buf *bp;
	struct buf *binval_list = EMPTY_LIST;
	int i, error = 0;
	kmutex_t *hmp;
	uint_t index;
	struct buf **backp;

	mutex_enter(&blist_lock);
	/*
	 * Wait for any flushes ahead of us to finish, it's ok to
	 * do invalidates in parallel.
	 */
	while (bio_doingflush) {
		bio_flinv_cv_wanted = 1;
		cv_wait(&bio_flushinval_cv, &blist_lock);
	}
	bio_doinginval++;

	/* Gather bp's */
	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_edev == dev) {
				if (bp->b_list == NULL) {
					bp->b_list = binval_list;
					binval_list = bp;
				}
			}
		}
		mutex_exit(hmp);
	}
	mutex_exit(&blist_lock);

	/* Invalidate all bp's found */
	while (binval_list != EMPTY_LIST) {
		bp = binval_list;

		sema_p(&bp->b_sem);
		if (bp->b_edev == dev) {
			if (force && (bp->b_flags & B_DELWRI)) {
				/* clear B_DELWRI, move to non-dw freelist */
				index = bio_bhash(bp->b_edev, bp->b_blkno);
				hmp = &hbuf[index].b_lock;
				dp = (struct buf *)&hbuf[index];
				mutex_enter(hmp);

				/* remove from delayed write freelist */
				notavail(bp);

				/* add to B_AGE side of non-dw freelist */
				backp = &dp->av_forw;
				(*backp)->av_back = bp;
				bp->av_forw = *backp;
				*backp = bp;
				bp->av_back = dp;

				/*
				 * make sure write retries and busy are cleared
				 */
				bp->b_flags &=
				    ~(B_BUSY | B_DELWRI | B_RETRYWRI);
				mutex_exit(hmp);
			}
			if ((bp->b_flags & B_DELWRI) == 0)
				bp->b_flags |= B_STALE|B_AGE;
			else
				error = EIO;
		}
		sema_v(&bp->b_sem);
		binval_list = bp->b_list;
		bp->b_list = NULL;
	}
	mutex_enter(&blist_lock);
	bio_doinginval--;
	if (bio_flinv_cv_wanted) {
		cv_broadcast(&bio_flushinval_cv);
		bio_flinv_cv_wanted = 0;
	}
	mutex_exit(&blist_lock);
	return (error);
}

/*
 * If possible, invalidate blocks for a dev on demand
 */
void
binval(dev_t dev)
{
	(void) bfinval(dev, 0);
}

/*
 * Initialize the buffer I/O system by freeing
 * all buffers and setting all device hash buffer lists to empty.
 */
void
binit(void)
{
	struct buf *bp;
	unsigned int i, pct;
	ulong_t	bio_max_hwm, bio_default_hwm;

	/*
	 * Maximum/Default values for bufhwm are set to the smallest of:
	 *	- BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
	 *	- 1/4 of kernel virtual memory
	 *	- INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
	 * Additionally, in order to allow simple tuning by percentage of
	 * physical memory, bufhwm_pct is used to calculate the default if
	 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
	 *
	 * Since the unit for v.v_bufhwm is kilobytes, this allows for
	 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
	 */
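	/*
	 * Illustrative arithmetic (not in the original source): on a
	 * machine with 4 GB of physical memory and 4 KB pages, physmem
	 * is 1048576 pages, so the default hwm works out to
	 * 1048576 / (100/2) * (4096/1024) = ~83886 KB, about 82 MB,
	 * i.e. 2% of memory -- unless the kernel-heap or INT32_MAX
	 * clamps kick in first.
	 */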
	bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
	bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);

	pct = BIO_BUF_PERCENT;
	if (bufhwm_pct != 0 &&
	    ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
		pct = BIO_BUF_PERCENT;
		/*
		 * Invalid user specified value, emit a warning.
		 */
		cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
		    range(1..%d). Using %d as default.",
		    bufhwm_pct,
		    100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
	}

	bio_default_hwm = MIN(physmem / pct,
	    btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
	bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);

	if ((v.v_bufhwm = bufhwm) == 0)
		v.v_bufhwm = bio_default_hwm;

	if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
		v.v_bufhwm = (int)bio_max_hwm;
		/*
		 * Invalid user specified value, emit a warning.
		 */
		cmn_err(CE_WARN,
		    "binit: bufhwm(%d) out \
		    of range(%d..%lu). Using %lu as default",
		    bufhwm,
		    BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
	}

	/*
	 * Determine the number of hash buckets. Default is to
	 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
	 * Round up number to the next power of 2.
	 */
	v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
	    BIO_HASHLEN);
	v.v_hmask = v.v_hbuf - 1;
	v.v_buf = BIO_BHDR_POOL;

	hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);

	dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);

	bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
	bp = &bfreelist;
	bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;

	for (i = 0; i < v.v_hbuf; i++) {
		hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
		hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];

		/*
		 * Initialize the delayed write buffer list.
		 */
		dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
		dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
	}
}

/*
 * Wait for I/O completion on the buffer; return error code.
 * If bp was for synchronous I/O, bp is invalid and associated
 * resources are freed on return.
 */
int
biowait(struct buf *bp)
{
	int error = 0;
	struct cpu *cpup;

	ASSERT(SEMA_HELD(&bp->b_sem));

	cpup = CPU;
	atomic_inc_64(&cpup->cpu_stats.sys.iowait);
	DTRACE_IO1(wait__start, struct buf *, bp);

	/*
	 * In case of panic, busy wait for completion
	 */
	if (panicstr) {
		while ((bp->b_flags & B_DONE) == 0)
			drv_usecwait(10);
	} else
		sema_p(&bp->b_io);

	DTRACE_IO1(wait__done, struct buf *, bp);
	atomic_dec_64(&cpup->cpu_stats.sys.iowait);

	error = geterror(bp);
	if ((bp->b_flags & B_ASYNC) == 0) {
		if (bp->b_flags & B_REMAPPED)
			bp_mapout(bp);
	}
	return (error);
}

/*
 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
 * and wake up anyone waiting for it.
 */
void
biodone(struct buf *bp)
{
	if (bp->b_flags & B_STARTED) {
		DTRACE_IO1(done, struct buf *, bp);
		bp->b_flags &= ~B_STARTED;
	}

	if (bp->b_iodone != NULL) {
		(*(bp->b_iodone))(bp);
		return;
	}
	ASSERT((bp->b_flags & B_DONE) == 0);
	ASSERT(SEMA_HELD(&bp->b_sem));
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_ASYNC) {
		if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
			bio_pageio_done(bp);
		else
			brelse(bp);	/* release bp to freelist */
	} else {
		sema_v(&bp->b_io);
	}
}
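
/*
 * Illustrative sketch (assumption): a block driver's interrupt handler
 * typically finishes a transfer with bioerror()/biodone(), e.g.:
 *
 *	static uint_t
 *	xx_intr(caddr_t arg)
 *	{
 *		struct buf *bp = ...;	(the completed transfer)
 *
 *		if (hw_error)
 *			bioerror(bp, EIO);
 *		biodone(bp);
 *		return (DDI_INTR_CLAIMED);
 *	}
 */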

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized code.
 */
int
geterror(struct buf *bp)
{
	int error = 0;

	ASSERT(SEMA_HELD(&bp->b_sem));
	if (bp->b_flags & B_ERROR) {
		error = bp->b_error;
		if (!error)
			error = EIO;
	}
	return (error);
}

/*
 * Support for pageio buffers.
 *
 * This stuff should be generalized to provide a generalized bp
 * header facility that can be used for things other than pageio.
 */

/*
 * Allocate and initialize a buf struct for use with pageio.
 */
struct buf *
pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
{
	struct buf *bp;
	struct cpu *cpup;

	if (flags & B_READ) {
		CPU_STATS_ENTER_K();
		cpup = CPU;	/* get pointer AFTER preemption is disabled */
		CPU_STATS_ADDQ(cpup, vm, pgin, 1);
		CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));

		atomic_add_64(&curzone->zone_pgpgin, btopr(len));

		if ((flags & B_ASYNC) == 0) {
			klwp_t *lwp = ttolwp(curthread);
			if (lwp != NULL)
				lwp->lwp_ru.majflt++;
			CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
		}
		/*
		 * Update statistics for pages being paged in
		 */
		if (pp != NULL && pp->p_vnode != NULL) {
			if (IS_SWAPFSVP(pp->p_vnode)) {
				CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
				atomic_add_64(&curzone->zone_anonpgin,
				    btopr(len));
			} else {
				if (pp->p_vnode->v_flag & VVMEXEC) {
					CPU_STATS_ADDQ(cpup, vm, execpgin,
					    btopr(len));
					atomic_add_64(&curzone->zone_execpgin,
					    btopr(len));
				} else {
					CPU_STATS_ADDQ(cpup, vm, fspgin,
					    btopr(len));
					atomic_add_64(&curzone->zone_fspgin,
					    btopr(len));
				}
			}
		}
		CPU_STATS_EXIT_K();
		TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
		    "page_ws_in:pp %p", pp);
	}

	bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
	bp->b_bcount = len;
	bp->b_bufsize = len;
	bp->b_pages = pp;
	bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
	bp->b_offset = -1;
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);

	/* Initialize bp->b_sem in "locked" state */
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);

	VN_HOLD(vp);
	bp->b_vp = vp;
	THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */

	/*
	 * Caller sets dev & blkno and can adjust
	 * b_addr for page offset and can use bp_mapin
	 * to make pages kernel addressable.
	 */
	return (bp);
}

void
pageio_done(struct buf *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	if (bp->b_flags & B_REMAPPED)
		bp_mapout(bp);
	VN_RELE(bp->b_vp);
	bp->b_vp = NULL;
	ASSERT((bp->b_flags & B_NOCACHE) != 0);

	/* A sema_v(bp->b_sem) is implied if we are destroying it */
	sema_destroy(&bp->b_sem);
	sema_destroy(&bp->b_io);
	kmem_free(bp, sizeof (struct buf));
}

/*
 * Check to see whether the buffers, except the one pointed by sbp,
 * associated with the device are busy.
 * NOTE: This expensive operation shall be improved together with ufs_icheck().
 */
int
bcheck(dev_t dev, struct buf *sbp)
{
	struct buf	*bp;
	struct buf	*dp;
	int i;
	kmutex_t *hmp;

	/*
	 * check for busy bufs for this filesystem
	 */
	for (i = 0; i < v.v_hbuf; i++) {
		dp = (struct buf *)&hbuf[i];
		hmp = &hbuf[i].b_lock;

		mutex_enter(hmp);
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			/*
			 * if buf is busy or dirty, then filesystem is busy
			 */
			if ((bp->b_edev == dev) &&
			    ((bp->b_flags & B_STALE) == 0) &&
			    (bp->b_flags & (B_DELWRI|B_BUSY)) &&
			    (bp != sbp)) {
				mutex_exit(hmp);
				return (1);
			}
		}
		mutex_exit(hmp);
	}
	return (0);
}

/*
 * Hash two 32 bit entities.
 */
int
hash2ints(int x, int y)
{
	int hash = 0;

	hash = x - 1;
	hash = ((hash * 7) + (x >> 8)) - 1;
	hash = ((hash * 7) + (x >> 16)) - 1;
	hash = ((hash * 7) + (x >> 24)) - 1;
	hash = ((hash * 7) + y) - 1;
	hash = ((hash * 7) + (y >> 8)) - 1;
	hash = ((hash * 7) + (y >> 16)) - 1;
	hash = ((hash * 7) + (y >> 24)) - 1;

	return (hash);
}
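
/*
 * Illustrative note (not in the original source): bio_bhash() masks this
 * hash down to a bucket index, so with v.v_hmask == 0x3ff, for example,
 * hash2ints(dev, blkno) & 0x3ff selects one of 1024 hash chains.
 */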

/*
 * Return a new buffer struct.
 *	Create a new buffer if we haven't gone over our high water
 *	mark for memory, otherwise try to get one off the freelist.
 *
 * Returns a locked buf that has no id and is not on any hash or free
 * list.
 */
static struct buf *
bio_getfreeblk(long bsize)
{
	struct buf *bp, *dp;
	struct hbuf *hp;
	kmutex_t	*hmp;
	uint_t		start, end;

	/*
	 * mutex_enter(&bfree_lock);
	 * bfreelist.b_bufsize represents the amount of memory
	 * mutex_exit(&bfree_lock); protect ref to bfreelist
	 * we are allowed to allocate in the cache before we hit our hwm.
	 */
	bio_mem_get(bsize);	/* Account for our memory request */

	bp = bio_bhdr_alloc();	/* Get a buf hdr */
	sema_p(&bp->b_sem);	/* Should never fail */

	ASSERT(bp->b_un.b_addr == NULL);
	bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
	if (bp->b_un.b_addr != NULL) {
		/*
		 * Make the common path short
		 */
		bp->b_bufsize = bsize;
		ASSERT(SEMA_HELD(&bp->b_sem));
		return (bp);
	} else {
		struct buf *save;

		save = bp;	/* Save bp we allocated */
		start = end = lastindex;

		biostats.bio_bufwant.value.ui32++;

		/*
		 * Memory isn't available from the system now. Scan
		 * the hash buckets till enough space is found.
		 */
		do {
			hp = &hbuf[start];
			hmp = &hp->b_lock;
			dp = (struct buf *)hp;

			mutex_enter(hmp);
			bp = dp->av_forw;

			while (bp != dp) {
				ASSERT(bp != NULL);

				if (!sema_tryp(&bp->b_sem)) {
					bp = bp->av_forw;
					continue;
				}

				/*
				 * Since we are going down the freelist
				 * associated with this hash bucket the
				 * B_DELWRI flag should not be set.
				 */
				ASSERT(!(bp->b_flags & B_DELWRI));

				if (bp->b_bufsize == bsize) {
					hp->b_length--;
					notavail(bp);
					bremhash(bp);
					mutex_exit(hmp);

					/*
					 * Didn't kmem_alloc any more, so don't
					 * count it twice.
					 */
					mutex_enter(&bfree_lock);
					bfreelist.b_bufsize += bsize;
					mutex_exit(&bfree_lock);

					/*
					 * Update the lastindex value.
					 */
					lastindex = start;

					/*
					 * Put our saved bp back on the list
					 */
					sema_v(&save->b_sem);
					bio_bhdr_free(save);
					ASSERT(SEMA_HELD(&bp->b_sem));
					return (bp);
				}
				sema_v(&bp->b_sem);
				bp = bp->av_forw;
			}
			mutex_exit(hmp);
			start = ((start + 1) % v.v_hbuf);
		} while (start != end);

		biostats.bio_bufwait.value.ui32++;
		bp = save;		/* Use original bp */
		bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
	}

	bp->b_bufsize = bsize;
	ASSERT(SEMA_HELD(&bp->b_sem));
	return (bp);
}

/*
 * Allocate a buffer header. If none currently available, allocate
 * a new pool.
 */
static struct buf *
bio_bhdr_alloc(void)
{
	struct buf *dp, *sdp;
	struct buf *bp;
	int i;

	for (;;) {
		mutex_enter(&bhdr_lock);
		if (bhdrlist != NULL) {
			bp = bhdrlist;
			bhdrlist = bp->av_forw;
			mutex_exit(&bhdr_lock);
			bp->av_forw = NULL;
			return (bp);
		}
		mutex_exit(&bhdr_lock);

		/*
		 * Need to allocate a new pool. If the system is currently
		 * out of memory, then try freeing things on the freelist.
		 */
		dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
		if (dp == NULL) {
			/*
			 * System can't give us a pool of headers, try
			 * recycling from the free lists.
			 */
			bio_recycle(BIO_HEADER, 0);
		} else {
			sdp = dp;
			for (i = 0; i < v.v_buf; i++, dp++) {
				/*
				 * The next two lines are needed since NODEV
				 * is -1 and not NULL
				 */
				dp->b_dev = (o_dev_t)NODEV;
				dp->b_edev = NODEV;
				dp->av_forw = dp + 1;
				sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
				    NULL);
				sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
				    NULL);
				dp->b_offset = -1;
			}
			mutex_enter(&bhdr_lock);
			(--dp)->av_forw = bhdrlist;	/* Fix last pointer */
			bhdrlist = sdp;
			nbuf += v.v_buf;
			bp = bhdrlist;
			bhdrlist = bp->av_forw;
			mutex_exit(&bhdr_lock);

			bp->av_forw = NULL;
			return (bp);
		}
	}
}

static void
bio_bhdr_free(struct buf *bp)
{
	ASSERT(bp->b_back == NULL);
	ASSERT(bp->b_forw == NULL);
	ASSERT(bp->av_back == NULL);
	ASSERT(bp->av_forw == NULL);
	ASSERT(bp->b_un.b_addr == NULL);
	ASSERT(bp->b_dev == (o_dev_t)NODEV);
	ASSERT(bp->b_edev == NODEV);
	ASSERT(bp->b_flags == 0);

	mutex_enter(&bhdr_lock);
	bp->av_forw = bhdrlist;
	bhdrlist = bp;
	mutex_exit(&bhdr_lock);
}

/*
 * If we haven't gone over the high water mark, it's o.k. to
 * allocate more buffer space, otherwise recycle buffers
 * from the freelist until enough memory is free for a bsize request.
 *
 * We account for this memory, even though
 * we don't allocate it here.
 */
static void
bio_mem_get(long bsize)
{
	mutex_enter(&bfree_lock);
	if (bfreelist.b_bufsize > bsize) {
		bfreelist.b_bufsize -= bsize;
		mutex_exit(&bfree_lock);
		return;
	}
	mutex_exit(&bfree_lock);
	bio_recycle(BIO_MEM, bsize);
}

/*
 * flush a list of delayed write buffers.
 * (currently used only by bio_recycle below.)
 */
static void
bio_flushlist(struct buf *delwri_list)
{
	struct buf *bp;

	while (delwri_list != EMPTY_LIST) {
		bp = delwri_list;
		bp->b_flags |= B_AGE | B_ASYNC;
		if (bp->b_vp == NULL) {			/* !ufs */
			BWRITE(bp);
		} else {				/* ufs */
			UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
		}
		delwri_list = bp->b_list;
		bp->b_list = NULL;
	}
}

/*
 * Start recycling buffers on the freelist for one of 2 reasons:
 *	- we need a buffer header
 *	- we need to free up memory
 * Once started we continue to recycle buffers until the B_AGE
 * buffers are gone.
 */
static void
bio_recycle(int want, long bsize)
{
	struct buf *bp, *dp, *dwp, *nbp;
	struct hbuf *hp;
	int	found = 0;
	kmutex_t	*hmp;
	int		start, end;
	struct buf *delwri_list = EMPTY_LIST;

	/*
	 * Recycle buffers.
	 */
top:
	start = end = lastindex;
	do {
		hp = &hbuf[start];
		hmp = &hp->b_lock;
		dp = (struct buf *)hp;

		mutex_enter(hmp);
		bp = dp->av_forw;

		while (bp != dp) {
			ASSERT(bp != NULL);

			if (!sema_tryp(&bp->b_sem)) {
				bp = bp->av_forw;
				continue;
			}
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */
			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				return;	/* All done */
			}

			ASSERT(MUTEX_HELD(&hp->b_lock));
			ASSERT(!(bp->b_flags & B_DELWRI));
			hp->b_length--;
			notavail(bp);

			/*
			 * Remove bhdr from cache, free up memory,
			 * and add the hdr to the freelist.
			 */
			bremhash(bp);
			mutex_exit(hmp);

			if (bp->b_bufsize) {
				kmem_free(bp->b_un.b_addr, bp->b_bufsize);
				bp->b_un.b_addr = NULL;
				mutex_enter(&bfree_lock);
				bfreelist.b_bufsize += bp->b_bufsize;
				mutex_exit(&bfree_lock);
			}

			bp->b_dev = (o_dev_t)NODEV;
			bp->b_edev = NODEV;
			bp->b_flags = 0;
			sema_v(&bp->b_sem);
			bio_bhdr_free(bp);
			if (want == BIO_HEADER) {
				found = 1;
			} else {
				ASSERT(want == BIO_MEM);
				if (!found && bfreelist.b_bufsize >= bsize) {
					/* Account for the memory we want */
					mutex_enter(&bfree_lock);
					if (bfreelist.b_bufsize >= bsize) {
						bfreelist.b_bufsize -= bsize;
						found = 1;
					}
					mutex_exit(&bfree_lock);
				}
			}

			/*
			 * Since we dropped hmp start from the
			 * begining.
			 */
			mutex_enter(hmp);
			bp = dp->av_forw;
		}
		mutex_exit(hmp);

		/*
		 * Look at the delayed write list.
		 * First gather into a private list, then write them.
		 */
		dwp = (struct buf *)&dwbuf[start];
		mutex_enter(&blist_lock);
		bio_doingflush++;
		mutex_enter(hmp);
		for (bp = dwp->av_forw; bp != dwp; bp = nbp) {
			ASSERT(bp != NULL);
			nbp = bp->av_forw;

			if (!sema_tryp(&bp->b_sem))
				continue;
			ASSERT(bp->b_flags & B_DELWRI);
			/*
			 * Do we really want to nuke all of the B_AGE stuff??
			 */
			if ((bp->b_flags & B_AGE) == 0 && found) {
				sema_v(&bp->b_sem);
				mutex_exit(hmp);
				lastindex = start;
				mutex_exit(&blist_lock);
				bio_flushlist(delwri_list);
				mutex_enter(&blist_lock);
				bio_doingflush--;
				if (bio_flinv_cv_wanted) {
					bio_flinv_cv_wanted = 0;
					cv_broadcast(&bio_flushinval_cv);
				}
				mutex_exit(&blist_lock);
				return;	/* All done */
			}

			/*
			 * If the buffer is already on a flush or
			 * invalidate list then just skip it.
			 */
			if (bp->b_list != NULL) {
				sema_v(&bp->b_sem);
				continue;
			}
			/*
			 * We are still on the same bucket.
			 */
			hp->b_length--;
			notavail(bp);
			bp->b_list = delwri_list;
			delwri_list = bp;
		}
		mutex_exit(hmp);
		mutex_exit(&blist_lock);
		bio_flushlist(delwri_list);
		delwri_list = EMPTY_LIST;
		mutex_enter(&blist_lock);
		bio_doingflush--;
		if (bio_flinv_cv_wanted) {
			bio_flinv_cv_wanted = 0;
			cv_broadcast(&bio_flushinval_cv);
		}
		mutex_exit(&blist_lock);
		start = (start + 1) % v.v_hbuf;

	} while (start != end);

	if (found)
		return;

	/*
	 * Free lists exhausted and we haven't satisfied the request.
	 * Wait here for more entries to be added to freelist.
	 * Because this might have just happened, make it timed.
	 */
	mutex_enter(&bfree_lock);
	bfreelist.b_flags |= B_WANTED;
	(void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
	mutex_exit(&bfree_lock);
	goto top;
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
 */
static int
bio_incore(dev_t dev, daddr_t blkno)
{
	struct buf *bp;
	struct buf *dp;
	uint_t index;
	kmutex_t *hmp;

	index = bio_bhash(dev, blkno);
	dp = (struct buf *)&hbuf[index];
	hmp = &hbuf[index].b_lock;

	mutex_enter(hmp);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno == blkno && bp->b_edev == dev &&
		    (bp->b_flags & B_STALE) == 0) {
			mutex_exit(hmp);
			return (1);
		}
	}
	mutex_exit(hmp);
	return (0);
}

static void
bio_pageio_done(struct buf *bp)
{
	if (bp->b_flags & B_PAGEIO) {

		if (bp->b_flags & B_REMAPPED)
			bp_mapout(bp);

		if (bp->b_flags & B_READ)
			pvn_read_done(bp->b_pages, bp->b_flags);
		else
			pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
		pageio_done(bp);
	} else {
		ASSERT(bp->b_flags & B_REMAPPED);
		bp_mapout(bp);
		brelse(bp);
	}
}

/*
 * bioerror(9F) - indicate error in buffer header
 * If 'error' is zero, remove the error indication.
 */
void
bioerror(struct buf *bp, int error)
{
	ASSERT(bp != NULL);
	ASSERT(error >= 0);
	ASSERT(SEMA_HELD(&bp->b_sem));

	if (error != 0) {
		bp->b_flags |= B_ERROR;
	} else {
		bp->b_flags &= ~B_ERROR;
	}
	bp->b_error = error;
}

/*
 * bioreset(9F) - reuse a private buffer header after I/O is complete
 */
void
bioreset(struct buf *bp)
{
	ASSERT(bp != NULL);

	biofini(bp);
	bioinit(bp);
}

/*
 * biosize(9F) - return size of a buffer header
 */
size_t
biosize(void)
{
	return (sizeof (struct buf));
}

/*
 * biomodified(9F) - check if buffer is modified
 */
int
biomodified(struct buf *bp)
{
	int npf;
	int ppattr;
	struct page *pp;

	ASSERT(bp != NULL);

	if ((bp->b_flags & B_PAGEIO) == 0) {
		return (-1);
	}
	pp = bp->b_pages;
	npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));

	while (npf > 0) {
		ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
		    HAT_SYNC_STOPON_MOD);
		if (ppattr & P_MOD)
			return (1);
		pp = pp->p_next;
		npf--;
	}

	return (0);
}

/*
 * bioinit(9F) - initialize a buffer structure
 */
void
bioinit(struct buf *bp)
{
	bzero(bp, sizeof (struct buf));
	sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
	sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
	bp->b_offset = -1;
}

/*
 * biofini(9F) - uninitialize a buffer structure
 */
void
biofini(struct buf *bp)
{
	sema_destroy(&bp->b_io);
	sema_destroy(&bp->b_sem);
}
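
/*
 * Illustrative sketch (assumption): a driver that manages its own buf
 * headers pairs these routines with kmem, per bioinit(9F)/biofini(9F):
 *
 *	struct buf *bp = kmem_alloc(biosize(), KM_SLEEP);
 *	bioinit(bp);
 *	... use bp for I/O, bioreset(bp) between transfers ...
 *	biofini(bp);
 *	kmem_free(bp, biosize());
 */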

/*
 * bioclone(9F) - clone a buffer
 */
struct buf *
bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
    int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
{
	struct buf *bufp;

	ASSERT(bp);
	if (bp_mem == NULL) {
		bufp = kmem_alloc(sizeof (struct buf), sleep);
		if (bufp == NULL) {
			return (NULL);
		}
		bioinit(bufp);
	} else {
		bufp = bp_mem;
		bioreset(bufp);
	}

#define	BUF_CLONE_FLAGS	(B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
	B_ABRWRITE)

	/*
	 * The cloned buffer does not inherit the B_REMAPPED flag.
	 */
	bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS) | B_BUSY;
	bufp->b_bcount = len;
	bufp->b_blkno = blkno;
	bufp->b_iodone = iodone;
	bufp->b_proc = bp->b_proc;
	bufp->b_edev = dev;
	bufp->b_file = bp->b_file;
	bufp->b_offset = bp->b_offset;

	if (bp->b_flags & B_SHADOW) {
		ASSERT(bp->b_shadow);
		ASSERT(bp->b_flags & B_PHYS);

		bufp->b_shadow = bp->b_shadow +
		    btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
		bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
		if (bp->b_flags & B_REMAPPED)
			bufp->b_proc = NULL;
	} else {
		if (bp->b_flags & B_PAGEIO) {
			struct page *pp;
			off_t o;
			int i;

			pp = bp->b_pages;
			o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
			for (i = btop(o); i > 0; i--) {
				pp = pp->p_next;
			}
			bufp->b_pages = pp;
			bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
		} else {
			bufp->b_un.b_addr =
			    (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
			if (bp->b_flags & B_REMAPPED)
				bufp->b_proc = NULL;
		}
	}
	return (bufp);
}