4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2011 Joyent, Inc. All rights reserved.
28 * Copyright (c) 2016 by Delphix. All rights reserved.
31 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
32 /* All Rights Reserved */
35 * University Copyright- Copyright (c) 1982, 1986, 1988
36 * The Regents of the University of California
39 * University Acknowledgment- Portions of this document are derived from
40 * software developed by the University of California, Berkeley, and its
44 #include <sys/types.h>
45 #include <sys/t_lock.h>
46 #include <sys/sysmacros.h>
48 #include <sys/cpuvar.h>
49 #include <sys/errno.h>
50 #include <sys/debug.h>
53 #include <sys/vnode.h>
54 #include <sys/bitmap.h>
55 #include <sys/cmn_err.h>
58 #include <sys/atomic.h>
59 #include <vm/seg_kmem.h>
62 #include <sys/vtrace.h>
63 #include <sys/tnf_probe.h>
64 #include <sys/fs/ufs_inode.h>
65 #include <sys/fs/ufs_bio.h>
66 #include <sys/fs/ufs_log.h>
67 #include <sys/systm.h>
72 static kmutex_t blist_lock
; /* protects b_list */
73 static kmutex_t bhdr_lock
; /* protects the bhdrlist */
74 static kmutex_t bfree_lock
; /* protects the bfreelist structure */
76 struct hbuf
*hbuf
; /* Hash buckets */
77 struct dwbuf
*dwbuf
; /* Delayed write buckets */
78 static struct buf
*bhdrlist
; /* buf header free list */
79 static int nbuf
; /* number of buffer headers allocated */
81 static int lastindex
; /* Reference point on where to start */
82 /* when looking for free buffers */
84 #define bio_bhash(dev, bn) (hash2ints((dev), (int)(bn)) & v.v_hmask)
85 #define EMPTY_LIST ((struct buf *)-1)
87 static kcondvar_t bio_mem_cv
; /* Condition variables */
88 static kcondvar_t bio_flushinval_cv
;
89 static int bio_doingflush
; /* flush in progress */
90 static int bio_doinginval
; /* inval in progress */
91 static int bio_flinv_cv_wanted
; /* someone waiting for cv */
94 * Statistics on the buffer cache
96 struct biostats biostats
= {
97 { "buffer_cache_lookups", KSTAT_DATA_UINT32
},
98 { "buffer_cache_hits", KSTAT_DATA_UINT32
},
99 { "new_buffer_requests", KSTAT_DATA_UINT32
},
100 { "waits_for_buffer_allocs", KSTAT_DATA_UINT32
},
101 { "buffers_locked_by_someone", KSTAT_DATA_UINT32
},
102 { "duplicate_buffers_found", KSTAT_DATA_UINT32
}
108 kstat_named_t
*biostats_ptr
= (kstat_named_t
*)&biostats
;
109 uint_t biostats_ndata
= (uint_t
)(sizeof (biostats
) /
110 sizeof (kstat_named_t
));
113 * Statistics on ufs buffer cache
114 * Not protected by locks
116 struct ufsbiostats ub
= {
117 { "breads", KSTAT_DATA_UINT32
},
118 { "bwrites", KSTAT_DATA_UINT32
},
119 { "fbiwrites", KSTAT_DATA_UINT32
},
120 { "getpages", KSTAT_DATA_UINT32
},
121 { "getras", KSTAT_DATA_UINT32
},
122 { "putsyncs", KSTAT_DATA_UINT32
},
123 { "putasyncs", KSTAT_DATA_UINT32
},
124 { "putpageios", KSTAT_DATA_UINT32
},
128 * more UFS Logging eccentricities...
130 * required since "#pragma weak ..." doesn't work in reverse order.
131 * i.e.: genunix (bio.c) is loaded before the ufs modules and pointers
132 * to ufs routines don't get plugged into bio.c calls so
133 * we initialize it when setting up the "lufsops" table
134 * in "lufs.c:_init()"
136 void (*bio_lufs_strategy
)(void *, buf_t
*);
137 void (*bio_snapshot_strategy
)(void *, buf_t
*);
140 /* Private routines */
141 static struct buf
*bio_getfreeblk(long);
142 static void bio_mem_get(long);
143 static void bio_bhdr_free(struct buf
*);
144 static struct buf
*bio_bhdr_alloc(void);
145 static void bio_recycle(int, long);
146 static void bio_pageio_done(struct buf
*);
147 static int bio_incore(dev_t
, daddr_t
);
150 * Buffer cache constants
152 #define BIO_BUF_PERCENT (100/2) /* default: 2% of memory */
153 #define BIO_MAX_PERCENT (100/20) /* max is 20% of real memory */
154 #define BIO_BHDR_POOL 100 /* Default bhdr pool size */
155 #define BIO_MIN_HDR 10 /* Minimum number of buffer headers */
156 #define BIO_MIN_HWM (BIO_MIN_HDR * MAXBSIZE / 1024)
157 #define BIO_HASHLEN 4 /* Target length of hash chains */
160 /* Flags for bio_recycle() */
161 #define BIO_HEADER 0x01
164 extern int bufhwm
; /* User tunable - high water mark for mem */
165 extern int bufhwm_pct
; /* ditto - given in % of physmem */
168 * The following routines allocate and free
169 * buffers with various side effects. In general the
170 * arguments to an allocate routine are a device and
171 * a block number, and the value is a pointer to
172 * to the buffer header; the buffer returned is locked with a
173 * binary semaphore so that no one else can touch it. If the block was
174 * already in core, no I/O need be done; if it is
175 * already locked, the process waits until it becomes free.
176 * The following routines allocate a buffer:
180 * Eventually the buffer must be released, possibly with the
181 * side effect of writing it out, by using one of
182 * bwrite/BWRITE/brwrite
187 * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
188 * Instead, a binary semaphore, b_sem is used to gain exclusive access to
189 * a buffer and a binary semaphore, b_io is used for I/O synchronization.
190 * B_DONE is still used to denote a buffer with I/O complete on it.
192 * The bfreelist.b_bcount field is computed everytime fsflush runs. It is
193 * should not be used where a very accurate count of the free buffers is
198 * Read in (if necessary) the block and return a buffer pointer.
200 * This interface is provided for binary compatibility. Using
201 * BREAD() directly avoids the extra function call overhead invoked
202 * by calling this routine.
205 bread(dev_t dev
, daddr_t blkno
, long bsize
)
207 return (BREAD(dev
, blkno
, bsize
));
211 * Common code for reading a buffer with various options
213 * Read in (if necessary) the block and return a buffer pointer.
216 bread_common(void *arg
, dev_t dev
, daddr_t blkno
, long bsize
)
218 struct ufsvfs
*ufsvfsp
= (struct ufsvfs
*)arg
;
220 klwp_t
*lwp
= ttolwp(curthread
);
222 CPU_STATS_ADD_K(sys
, lread
, 1);
223 bp
= getblk_common(ufsvfsp
, dev
, blkno
, bsize
, /* errflg */ 1);
224 if (bp
->b_flags
& B_DONE
)
226 bp
->b_flags
|= B_READ
;
227 ASSERT(bp
->b_bcount
== bsize
);
228 if (ufsvfsp
== NULL
) { /* !ufs */
229 (void) bdev_strategy(bp
);
230 } else if (ufsvfsp
->vfs_log
&& bio_lufs_strategy
!= NULL
) {
232 (*bio_lufs_strategy
)(ufsvfsp
->vfs_log
, bp
);
233 } else if (ufsvfsp
->vfs_snapshot
&& bio_snapshot_strategy
!= NULL
) {
234 /* ufs && snapshots */
235 (*bio_snapshot_strategy
)(&ufsvfsp
->vfs_snapshot
, bp
);
237 ufsvfsp
->vfs_iotstamp
= ddi_get_lbolt();
238 ub
.ub_breads
.value
.ul
++; /* ufs && !logging */
239 (void) bdev_strategy(bp
);
242 lwp
->lwp_ru
.inblock
++;
243 CPU_STATS_ADD_K(sys
, bread
, 1);
249 * Read in the block, like bread, but also start I/O on the
250 * read-ahead block (which is not allocated to the caller).
253 breada(dev_t dev
, daddr_t blkno
, daddr_t rablkno
, long bsize
)
255 struct buf
*bp
, *rabp
;
256 klwp_t
*lwp
= ttolwp(curthread
);
259 if (!bio_incore(dev
, blkno
)) {
260 CPU_STATS_ADD_K(sys
, lread
, 1);
261 bp
= GETBLK(dev
, blkno
, bsize
);
262 if ((bp
->b_flags
& B_DONE
) == 0) {
263 bp
->b_flags
|= B_READ
;
264 bp
->b_bcount
= bsize
;
265 (void) bdev_strategy(bp
);
267 lwp
->lwp_ru
.inblock
++;
268 CPU_STATS_ADD_K(sys
, bread
, 1);
271 if (rablkno
&& bfreelist
.b_bcount
> 1 &&
272 !bio_incore(dev
, rablkno
)) {
273 rabp
= GETBLK(dev
, rablkno
, bsize
);
274 if (rabp
->b_flags
& B_DONE
)
277 rabp
->b_flags
|= B_READ
|B_ASYNC
;
278 rabp
->b_bcount
= bsize
;
279 (void) bdev_strategy(rabp
);
281 lwp
->lwp_ru
.inblock
++;
282 CPU_STATS_ADD_K(sys
, bread
, 1);
286 return (BREAD(dev
, blkno
, bsize
));
292 * Common code for writing a buffer with various options.
294 * force_wait - wait for write completion regardless of B_ASYNC flag
295 * do_relse - release the buffer when we are done
296 * clear_flags - flags to clear from the buffer
299 bwrite_common(void *arg
, struct buf
*bp
, int force_wait
,
300 int do_relse
, int clear_flags
)
302 register int do_wait
;
303 struct ufsvfs
*ufsvfsp
= (struct ufsvfs
*)arg
;
305 klwp_t
*lwp
= ttolwp(curthread
);
308 ASSERT(SEMA_HELD(&bp
->b_sem
));
310 bp
->b_flags
&= ~clear_flags
;
312 lwp
->lwp_ru
.oublock
++;
314 cpup
= CPU
; /* get pointer AFTER preemption is disabled */
315 CPU_STATS_ADDQ(cpup
, sys
, lwrite
, 1);
316 CPU_STATS_ADDQ(cpup
, sys
, bwrite
, 1);
317 do_wait
= ((flag
& B_ASYNC
) == 0 || force_wait
);
319 CPU_STATS_ADDQ(cpup
, sys
, bawrite
, 1);
321 if (ufsvfsp
== NULL
) {
322 (void) bdev_strategy(bp
);
323 } else if (ufsvfsp
->vfs_log
&& bio_lufs_strategy
!= NULL
) {
325 (*bio_lufs_strategy
)(ufsvfsp
->vfs_log
, bp
);
326 } else if (ufsvfsp
->vfs_snapshot
&& bio_snapshot_strategy
!= NULL
) {
327 /* ufs && snapshots */
328 (*bio_snapshot_strategy
)(&ufsvfsp
->vfs_snapshot
, bp
);
330 ub
.ub_bwrites
.value
.ul
++; /* ufs && !logging */
331 (void) bdev_strategy(bp
);
/*
 * Write the buffer, waiting for completion (unless B_ASYNC is set).
 * Then release the buffer.
 * This interface is provided for binary compatibility.  Using
 * BWRITE() directly avoids the extra function call overhead invoked
 * by calling this routine.
 */
void
bwrite(struct buf *bp)
{
	/* Thin compatibility wrapper around the BWRITE() macro. */
	BWRITE(bp);
}
/*
 * Write the buffer, waiting for completion.
 * But don't release the buffer afterwards.
 * This interface is provided for binary compatibility.  Using
 * BWRITE2() directly avoids the extra function call overhead.
 */
void
bwrite2(struct buf *bp)
{
	/* Thin compatibility wrapper around the BWRITE2() macro. */
	BWRITE2(bp);
}
367 * Release the buffer, marking it so that if it is grabbed
368 * for another purpose it will be written out before being
369 * given up (e.g. when writing a partial block where it is
370 * assumed that another write for the same block will soon follow).
371 * Also save the time that the block is first marked as delayed
372 * so that it will be written in a reasonable time.
375 bdwrite(struct buf
*bp
)
377 ASSERT(SEMA_HELD(&bp
->b_sem
));
378 CPU_STATS_ADD_K(sys
, lwrite
, 1);
379 if ((bp
->b_flags
& B_DELWRI
) == 0)
380 bp
->b_start
= ddi_get_lbolt();
382 * B_DONE allows others to use the buffer, B_DELWRI causes the
383 * buffer to be written before being reused, and setting b_resid
384 * to zero says the buffer is complete.
386 bp
->b_flags
|= B_DELWRI
| B_DONE
;
392 * Release the buffer, start I/O on it, but don't wait for completion.
395 bawrite(struct buf
*bp
)
397 ASSERT(SEMA_HELD(&bp
->b_sem
));
399 /* Use bfreelist.b_bcount as a weird-ass heuristic */
400 if (bfreelist
.b_bcount
> 4)
401 bp
->b_flags
|= B_ASYNC
;
406 * Release the buffer, with no I/O implied.
409 brelse(struct buf
*bp
)
418 ASSERT(SEMA_HELD(&bp
->b_sem
));
421 * Clear the retry write flag if the buffer was written without
422 * error. The presence of B_DELWRI means the buffer has not yet
423 * been written and the presence of B_ERROR means that an error
424 * is still occurring.
426 if ((bp
->b_flags
& (B_ERROR
| B_DELWRI
| B_RETRYWRI
)) == B_RETRYWRI
) {
427 bp
->b_flags
&= ~B_RETRYWRI
;
430 /* Check for anomalous conditions */
431 if (bp
->b_flags
& (B_ERROR
|B_NOCACHE
)) {
432 if (bp
->b_flags
& B_NOCACHE
) {
433 /* Don't add to the freelist. Destroy it now */
434 kmem_free(bp
->b_un
.b_addr
, bp
->b_bufsize
);
435 sema_destroy(&bp
->b_sem
);
436 sema_destroy(&bp
->b_io
);
437 kmem_free(bp
, sizeof (struct buf
));
441 * If a write failed and we are supposed to retry write,
442 * don't toss the buffer. Keep it around and mark it
443 * delayed write in the hopes that it will eventually
444 * get flushed (and still keep the system running.)
446 if ((bp
->b_flags
& (B_READ
| B_RETRYWRI
)) == B_RETRYWRI
) {
447 bp
->b_flags
|= B_DELWRI
;
448 /* keep fsflush from trying continuously to flush */
449 bp
->b_start
= ddi_get_lbolt();
451 bp
->b_flags
|= B_AGE
|B_STALE
;
452 bp
->b_flags
&= ~B_ERROR
;
457 * If delayed write is set then put in on the delayed
458 * write list instead of the free buffer list.
460 index
= bio_bhash(bp
->b_edev
, bp
->b_blkno
);
461 hmp
= &hbuf
[index
].b_lock
;
465 dp
= (struct buf
*)hp
;
468 * Make sure that the number of entries on this list are
469 * Zero <= count <= total # buffers
471 ASSERT(hp
->b_length
>= 0);
472 ASSERT(hp
->b_length
< nbuf
);
474 hp
->b_length
++; /* We are adding this buffer */
476 if (bp
->b_flags
& B_DELWRI
) {
478 * This buffer goes on the delayed write buffer list
480 dp
= (struct buf
*)&dwbuf
[index
];
482 ASSERT(bp
->b_bufsize
> 0);
483 ASSERT(bp
->b_bcount
> 0);
484 ASSERT(bp
->b_un
.b_addr
!= NULL
);
486 if (bp
->b_flags
& B_AGE
) {
487 backp
= &dp
->av_forw
;
488 (*backp
)->av_back
= bp
;
489 bp
->av_forw
= *backp
;
493 backp
= &dp
->av_back
;
494 (*backp
)->av_forw
= bp
;
495 bp
->av_back
= *backp
;
501 if (bfreelist
.b_flags
& B_WANTED
) {
503 * Should come here very very rarely.
505 mutex_enter(&bfree_lock
);
506 if (bfreelist
.b_flags
& B_WANTED
) {
507 bfreelist
.b_flags
&= ~B_WANTED
;
508 cv_broadcast(&bio_mem_cv
);
510 mutex_exit(&bfree_lock
);
513 bp
->b_flags
&= ~(B_WANTED
|B_BUSY
|B_ASYNC
);
515 * Don't let anyone get the buffer off the freelist before we
516 * release our hold on it.
522 * Return a count of the number of B_BUSY buffers in the system
523 * Can only be used as a good estimate. If 'cleanit' is set,
524 * try to flush all bufs.
527 bio_busy(int cleanit
)
534 for (i
= 0; i
< v
.v_hbuf
; i
++) {
535 dp
= (struct buf
*)&hbuf
[i
];
536 hmp
= &hbuf
[i
].b_lock
;
539 for (bp
= dp
->b_forw
; bp
!= dp
; bp
= bp
->b_forw
) {
540 if (bp
->b_flags
& B_BUSY
)
546 if (cleanit
&& busy
!= 0) {
554 * this interface is provided for binary compatibility.
556 * Assign a buffer for the given block. If the appropriate
557 * block is already associated, return it; otherwise search
558 * for the oldest non-busy buffer and reassign it.
561 getblk(dev_t dev
, daddr_t blkno
, long bsize
)
563 return (getblk_common(/* ufsvfsp */ NULL
, dev
,
564 blkno
, bsize
, /* errflg */ 0));
568 * Assign a buffer for the given block. If the appropriate
569 * block is already associated, return it; otherwise search
570 * for the oldest non-busy buffer and reassign it.
573 getblk_common(void * arg
, dev_t dev
, daddr_t blkno
, long bsize
, int errflg
)
575 ufsvfs_t
*ufsvfsp
= (struct ufsvfs
*)arg
;
578 struct buf
*nbp
= NULL
;
584 if (getmajor(dev
) >= devcnt
)
585 cmn_err(CE_PANIC
, "blkdev");
587 biostats
.bio_lookup
.value
.ui32
++;
589 index
= bio_bhash(dev
, blkno
);
591 dp
= (struct buf
*)hp
;
596 for (bp
= dp
->b_forw
; bp
!= dp
; bp
= bp
->b_forw
) {
597 if (bp
->b_blkno
!= blkno
|| bp
->b_edev
!= dev
||
598 (bp
->b_flags
& B_STALE
))
601 * Avoid holding the hash lock in the event that
602 * the buffer is locked by someone. Since the hash chain
603 * may change when we drop the hash lock
604 * we have to start at the beginning of the chain if the
605 * buffer identity/contents aren't valid.
607 if (!sema_tryp(&bp
->b_sem
)) {
608 biostats
.bio_bufbusy
.value
.ui32
++;
611 * OK, we are dealing with a busy buffer.
612 * In the case that we are panicking and we
613 * got called from bread(), we have some chance
614 * for error recovery. So better bail out from
615 * here since sema_p() won't block. If we got
616 * called directly from ufs routines, there is
617 * no way to report an error yet.
619 if (panicstr
&& errflg
)
622 * For the following line of code to work
623 * correctly never kmem_free the buffer "header".
626 if (bp
->b_blkno
!= blkno
|| bp
->b_edev
!= dev
||
627 (bp
->b_flags
& B_STALE
)) {
630 goto loop
; /* start over */
635 biostats
.bio_hit
.value
.ui32
++;
636 bp
->b_flags
&= ~B_AGE
;
639 * Yank it off the free/delayed write lists
645 ASSERT((bp
->b_flags
& B_NOCACHE
) == NULL
);
649 * Make the common path short.
651 ASSERT(SEMA_HELD(&bp
->b_sem
));
655 biostats
.bio_bufdup
.value
.ui32
++;
658 * The buffer must have entered during the lock upgrade
659 * so free the new buffer we allocated and return the
662 kmem_free(nbp
->b_un
.b_addr
, nbp
->b_bufsize
);
663 nbp
->b_un
.b_addr
= NULL
;
666 * Account for the memory
668 mutex_enter(&bfree_lock
);
669 bfreelist
.b_bufsize
+= nbp
->b_bufsize
;
670 mutex_exit(&bfree_lock
);
673 * Destroy buf identity, and place on avail list
675 nbp
->b_dev
= (o_dev_t
)NODEV
;
684 ASSERT(SEMA_HELD(&bp
->b_sem
));
689 * bio_getfreeblk may block so check the hash chain again.
693 nbp
= bio_getfreeblk(bsize
);
699 * New buffer. Assign nbp and stick it on the hash.
701 nbp
->b_flags
= B_BUSY
;
703 nbp
->b_dev
= (o_dev_t
)cmpdev(dev
);
704 nbp
->b_blkno
= blkno
;
705 nbp
->b_iodone
= NULL
;
706 nbp
->b_bcount
= bsize
;
708 * If we are given a ufsvfsp and the vfs_root field is NULL
709 * then this must be I/O for a superblock. A superblock's
710 * buffer is set up in mountfs() and there is no root vnode
713 if (ufsvfsp
&& ufsvfsp
->vfs_root
) {
714 nbp
->b_vp
= ufsvfsp
->vfs_root
;
719 ASSERT((nbp
->b_flags
& B_NOCACHE
) == NULL
);
724 ASSERT(SEMA_HELD(&nbp
->b_sem
));
730 * Come here in case of an internal error. At this point we couldn't
731 * get a buffer, but we have to return one. Hence we allocate some
732 * kind of error reply buffer on the fly. This buffer is marked as
733 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
734 * - B_ERROR will indicate error to the caller.
735 * - B_DONE will prevent us from reading the buffer from
737 * - B_NOCACHE will cause that this buffer gets free'd in
743 sema_p(&errbp
->b_sem
);
744 errbp
->b_flags
&= ~B_BUSY
;
745 errbp
->b_flags
|= (B_ERROR
| B_DONE
);
750 * Get an empty block, not assigned to any particular device.
751 * Returns a locked buffer that is not on any hash or free list.
758 bp
= kmem_alloc(sizeof (struct buf
), KM_SLEEP
);
760 bp
->av_forw
= bp
->av_back
= NULL
;
761 bp
->b_un
.b_addr
= kmem_alloc(bsize
, KM_SLEEP
);
762 bp
->b_bufsize
= bsize
;
763 bp
->b_flags
= B_BUSY
| B_NOCACHE
| B_AGE
;
764 bp
->b_dev
= (o_dev_t
)NODEV
;
767 bp
->b_bcount
= bsize
;
/*
 * Interface of geteblk() is kept intact to maintain driver compatibility.
 * Use ngeteblk() to allocate block size other than 1 KB.
 */
struct buf *
geteblk(void)
{
	/* Historical default block size: 1 KB. */
	return (ngeteblk((long)1024));
}
783 * Return a buffer w/o sleeping
786 trygetblk(dev_t dev
, daddr_t blkno
)
794 index
= bio_bhash(dev
, blkno
);
798 if (!mutex_tryenter(hmp
))
801 dp
= (struct buf
*)hp
;
802 for (bp
= dp
->b_forw
; bp
!= dp
; bp
= bp
->b_forw
) {
803 if (bp
->b_blkno
!= blkno
|| bp
->b_edev
!= dev
||
804 (bp
->b_flags
& B_STALE
))
807 * Get access to a valid buffer without sleeping
809 if (sema_tryp(&bp
->b_sem
)) {
810 if (bp
->b_flags
& B_DONE
) {
827 * Wait for I/O completion on the buffer; return errors
831 iowait(struct buf
*bp
)
833 ASSERT(SEMA_HELD(&bp
->b_sem
));
834 return (biowait(bp
));
838 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
839 * and wake up anyone waiting for it.
842 iodone(struct buf
*bp
)
844 ASSERT(SEMA_HELD(&bp
->b_sem
));
849 * Zero the core associated with a buffer.
852 clrbuf(struct buf
*bp
)
854 ASSERT(SEMA_HELD(&bp
->b_sem
));
855 bzero(bp
->b_un
.b_addr
, bp
->b_bcount
);
861 * Make sure all write-behind blocks on dev (or NODEV for all)
869 struct buf
*delwri_list
= EMPTY_LIST
;
873 mutex_enter(&blist_lock
);
875 * Wait for any invalidates or flushes ahead of us to finish.
876 * We really could split blist_lock up per device for better
879 while (bio_doinginval
|| bio_doingflush
) {
880 bio_flinv_cv_wanted
= 1;
881 cv_wait(&bio_flushinval_cv
, &blist_lock
);
885 * Gather all B_DELWRI buffer for device.
886 * Lock ordering is b_sem > hash lock (brelse).
887 * Since we are finding the buffer via the delayed write list,
888 * it may be busy and we would block trying to get the
889 * b_sem lock while holding hash lock. So transfer all the
890 * candidates on the delwri_list and then drop the hash locks.
892 for (i
= 0; i
< v
.v_hbuf
; i
++) {
893 hmp
= &hbuf
[i
].b_lock
;
894 dp
= (struct buf
*)&dwbuf
[i
];
896 for (bp
= dp
->av_forw
; bp
!= dp
; bp
= bp
->av_forw
) {
897 if (dev
== NODEV
|| bp
->b_edev
== dev
) {
898 if (bp
->b_list
== NULL
) {
899 bp
->b_list
= delwri_list
;
906 mutex_exit(&blist_lock
);
909 * Now that the hash locks have been dropped grab the semaphores
910 * and write back all the buffers that have B_DELWRI set.
912 while (delwri_list
!= EMPTY_LIST
) {
915 sema_p(&bp
->b_sem
); /* may block */
916 if ((dev
!= bp
->b_edev
&& dev
!= NODEV
) ||
917 (panicstr
&& bp
->b_flags
& B_BUSY
)) {
919 delwri_list
= bp
->b_list
;
921 continue; /* No longer a candidate */
923 if (bp
->b_flags
& B_DELWRI
) {
924 index
= bio_bhash(bp
->b_edev
, bp
->b_blkno
);
927 dp
= (struct buf
*)hp
;
929 bp
->b_flags
|= B_ASYNC
;
934 if (bp
->b_vp
== NULL
) { /* !ufs */
937 UFS_BWRITE(VTOI(bp
->b_vp
)->i_ufsvfs
, bp
);
942 delwri_list
= bp
->b_list
;
945 mutex_enter(&blist_lock
);
947 if (bio_flinv_cv_wanted
) {
948 bio_flinv_cv_wanted
= 0;
949 cv_broadcast(&bio_flushinval_cv
);
951 mutex_exit(&blist_lock
);
955 * Ensure that a specified block is up-to-date on disk.
958 blkflush(dev_t dev
, daddr_t blkno
)
962 struct buf
*sbp
= NULL
;
966 index
= bio_bhash(dev
, blkno
);
968 dp
= (struct buf
*)hp
;
972 * Identify the buffer in the cache belonging to
973 * this device and blkno (if any).
976 for (bp
= dp
->b_forw
; bp
!= dp
; bp
= bp
->b_forw
) {
977 if (bp
->b_blkno
!= blkno
|| bp
->b_edev
!= dev
||
978 (bp
->b_flags
& B_STALE
))
987 * Now check the buffer we have identified and
988 * make sure it still belongs to the device and is B_DELWRI
991 if (sbp
->b_blkno
== blkno
&& sbp
->b_edev
== dev
&&
992 (sbp
->b_flags
& (B_DELWRI
|B_STALE
)) == B_DELWRI
) {
998 * XXX - There is nothing to guarantee a synchronous
999 * write here if the B_ASYNC flag is set. This needs
1000 * some investigation.
1002 if (sbp
->b_vp
== NULL
) { /* !ufs */
1003 BWRITE(sbp
); /* synchronous write */
1005 UFS_BWRITE(VTOI(sbp
->b_vp
)->i_ufsvfs
, sbp
);
1008 sema_v(&sbp
->b_sem
);
1013 * Same as binval, except can force-invalidate delayed-write buffers
1014 * (which are not be already flushed because of device errors). Also
1015 * makes sure that the retry write flag is cleared.
1018 bfinval(dev_t dev
, int force
)
1022 struct buf
*binval_list
= EMPTY_LIST
;
1028 mutex_enter(&blist_lock
);
1030 * Wait for any flushes ahead of us to finish, it's ok to
1031 * do invalidates in parallel.
1033 while (bio_doingflush
) {
1034 bio_flinv_cv_wanted
= 1;
1035 cv_wait(&bio_flushinval_cv
, &blist_lock
);
1040 for (i
= 0; i
< v
.v_hbuf
; i
++) {
1041 dp
= (struct buf
*)&hbuf
[i
];
1042 hmp
= &hbuf
[i
].b_lock
;
1045 for (bp
= dp
->b_forw
; bp
!= dp
; bp
= bp
->b_forw
) {
1046 if (bp
->b_edev
== dev
) {
1047 if (bp
->b_list
== NULL
) {
1048 bp
->b_list
= binval_list
;
1055 mutex_exit(&blist_lock
);
1057 /* Invalidate all bp's found */
1058 while (binval_list
!= EMPTY_LIST
) {
1062 if (bp
->b_edev
== dev
) {
1063 if (force
&& (bp
->b_flags
& B_DELWRI
)) {
1064 /* clear B_DELWRI, move to non-dw freelist */
1065 index
= bio_bhash(bp
->b_edev
, bp
->b_blkno
);
1066 hmp
= &hbuf
[index
].b_lock
;
1067 dp
= (struct buf
*)&hbuf
[index
];
1070 /* remove from delayed write freelist */
1073 /* add to B_AGE side of non-dw freelist */
1074 backp
= &dp
->av_forw
;
1075 (*backp
)->av_back
= bp
;
1076 bp
->av_forw
= *backp
;
1081 * make sure write retries and busy are cleared
1084 ~(B_BUSY
| B_DELWRI
| B_RETRYWRI
);
1087 if ((bp
->b_flags
& B_DELWRI
) == 0)
1088 bp
->b_flags
|= B_STALE
|B_AGE
;
1093 binval_list
= bp
->b_list
;
1096 mutex_enter(&blist_lock
);
1098 if (bio_flinv_cv_wanted
) {
1099 cv_broadcast(&bio_flushinval_cv
);
1100 bio_flinv_cv_wanted
= 0;
1102 mutex_exit(&blist_lock
);
1107 * If possible, invalidate blocks for a dev on demand
1112 (void) bfinval(dev
, 0);
1116 * Initialize the buffer I/O system by freeing
1117 * all buffers and setting all device hash buffer lists to empty.
1123 unsigned int i
, pct
;
1124 ulong_t bio_max_hwm
, bio_default_hwm
;
1127 * Maximum/Default values for bufhwm are set to the smallest of:
1128 * - BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
1129 * - 1/4 of kernel virtual memory
1130 * - INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
1131 * Additionally, in order to allow simple tuning by percentage of
1132 * physical memory, bufhwm_pct is used to calculate the default if
1133 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
1135 * Since the unit for v.v_bufhwm is kilobytes, this allows for
1136 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
1138 bio_max_hwm
= MIN(physmem
/ BIO_MAX_PERCENT
,
1139 btop(vmem_size(heap_arena
, VMEM_FREE
)) / 4) * (PAGESIZE
/ 1024);
1140 bio_max_hwm
= MIN(INT32_MAX
, bio_max_hwm
);
1142 pct
= BIO_BUF_PERCENT
;
1143 if (bufhwm_pct
!= 0 &&
1144 ((pct
= 100 / bufhwm_pct
) < BIO_MAX_PERCENT
)) {
1145 pct
= BIO_BUF_PERCENT
;
1147 * Invalid user specified value, emit a warning.
1149 cmn_err(CE_WARN
, "binit: bufhwm_pct(%d) out of \
1150 range(1..%d). Using %d as default.",
1152 100 / BIO_MAX_PERCENT
, 100 / BIO_BUF_PERCENT
);
1155 bio_default_hwm
= MIN(physmem
/ pct
,
1156 btop(vmem_size(heap_arena
, VMEM_FREE
)) / 4) * (PAGESIZE
/ 1024);
1157 bio_default_hwm
= MIN(INT32_MAX
, bio_default_hwm
);
1159 if ((v
.v_bufhwm
= bufhwm
) == 0)
1160 v
.v_bufhwm
= bio_default_hwm
;
1162 if (v
.v_bufhwm
< BIO_MIN_HWM
|| v
.v_bufhwm
> bio_max_hwm
) {
1163 v
.v_bufhwm
= (int)bio_max_hwm
;
1165 * Invalid user specified value, emit a warning.
1168 "binit: bufhwm(%d) out \
1169 of range(%d..%lu). Using %lu as default",
1171 BIO_MIN_HWM
, bio_max_hwm
, bio_max_hwm
);
1175 * Determine the number of hash buckets. Default is to
1176 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
1177 * Round up number to the next power of 2.
1179 v
.v_hbuf
= 1 << highbit((((ulong_t
)v
.v_bufhwm
* 1024) / MAXBSIZE
) /
1181 v
.v_hmask
= v
.v_hbuf
- 1;
1182 v
.v_buf
= BIO_BHDR_POOL
;
1184 hbuf
= kmem_zalloc(v
.v_hbuf
* sizeof (struct hbuf
), KM_SLEEP
);
1186 dwbuf
= kmem_zalloc(v
.v_hbuf
* sizeof (struct dwbuf
), KM_SLEEP
);
1188 bfreelist
.b_bufsize
= (size_t)v
.v_bufhwm
* 1024;
1190 bp
->b_forw
= bp
->b_back
= bp
->av_forw
= bp
->av_back
= bp
;
1192 for (i
= 0; i
< v
.v_hbuf
; i
++) {
1193 hbuf
[i
].b_forw
= hbuf
[i
].b_back
= (struct buf
*)&hbuf
[i
];
1194 hbuf
[i
].av_forw
= hbuf
[i
].av_back
= (struct buf
*)&hbuf
[i
];
1197 * Initialize the delayed write buffer list.
1199 dwbuf
[i
].b_forw
= dwbuf
[i
].b_back
= (struct buf
*)&dwbuf
[i
];
1200 dwbuf
[i
].av_forw
= dwbuf
[i
].av_back
= (struct buf
*)&dwbuf
[i
];
1205 * Wait for I/O completion on the buffer; return error code.
1206 * If bp was for synchronous I/O, bp is invalid and associated
1207 * resources are freed on return.
1210 biowait(struct buf
*bp
)
1215 ASSERT(SEMA_HELD(&bp
->b_sem
));
1218 atomic_inc_64(&cpup
->cpu_stats
.sys
.iowait
);
1219 DTRACE_IO1(wait__start
, struct buf
*, bp
);
1222 * In case of panic, busy wait for completion
1225 while ((bp
->b_flags
& B_DONE
) == 0)
1230 DTRACE_IO1(wait__done
, struct buf
*, bp
);
1231 atomic_dec_64(&cpup
->cpu_stats
.sys
.iowait
);
1233 error
= geterror(bp
);
1234 if ((bp
->b_flags
& B_ASYNC
) == 0) {
1235 if (bp
->b_flags
& B_REMAPPED
)
1242 biodone_tnf_probe(struct buf
*bp
)
1245 TNF_PROBE_3(biodone
, "io blockio", /* CSTYLED */,
1246 tnf_device
, device
, bp
->b_edev
,
1247 tnf_diskaddr
, block
, bp
->b_lblkno
,
1248 tnf_opaque
, buf
, bp
);
1252 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
1253 * and wake up anyone waiting for it.
1256 biodone(struct buf
*bp
)
1258 if (bp
->b_flags
& B_STARTED
) {
1259 DTRACE_IO1(done
, struct buf
*, bp
);
1260 bp
->b_flags
&= ~B_STARTED
;
1264 * Call the TNF probe here instead of the inline code
1265 * to force our compiler to use the tail call optimization.
1267 biodone_tnf_probe(bp
);
1269 if (bp
->b_iodone
!= NULL
) {
1270 (*(bp
->b_iodone
))(bp
);
1273 ASSERT((bp
->b_flags
& B_DONE
) == 0);
1274 ASSERT(SEMA_HELD(&bp
->b_sem
));
1275 bp
->b_flags
|= B_DONE
;
1276 if (bp
->b_flags
& B_ASYNC
) {
1277 if (bp
->b_flags
& (B_PAGEIO
|B_REMAPPED
))
1278 bio_pageio_done(bp
);
1280 brelse(bp
); /* release bp to freelist */
1287 * Pick up the device's error number and pass it to the user;
1288 * if there is an error but the number is 0 set a generalized code.
1291 geterror(struct buf
*bp
)
1295 ASSERT(SEMA_HELD(&bp
->b_sem
));
1296 if (bp
->b_flags
& B_ERROR
) {
1297 error
= bp
->b_error
;
1305 * Support for pageio buffers.
1307 * This stuff should be generalized to provide a generalized bp
1308 * header facility that can be used for things other than pageio.
1312 * Allocate and initialize a buf struct for use with pageio.
1315 pageio_setup(struct page
*pp
, size_t len
, struct vnode
*vp
, int flags
)
1320 if (flags
& B_READ
) {
1321 CPU_STATS_ENTER_K();
1322 cpup
= CPU
; /* get pointer AFTER preemption is disabled */
1323 CPU_STATS_ADDQ(cpup
, vm
, pgin
, 1);
1324 CPU_STATS_ADDQ(cpup
, vm
, pgpgin
, btopr(len
));
1326 atomic_add_64(&curzone
->zone_pgpgin
, btopr(len
));
1328 if ((flags
& B_ASYNC
) == 0) {
1329 klwp_t
*lwp
= ttolwp(curthread
);
1331 lwp
->lwp_ru
.majflt
++;
1332 CPU_STATS_ADDQ(cpup
, vm
, maj_fault
, 1);
1334 TNF_PROBE_2(major_fault
, "vm pagefault", /* CSTYLED */,
1335 tnf_opaque
, vnode
, pp
->p_vnode
,
1336 tnf_offset
, offset
, pp
->p_offset
);
1339 * Update statistics for pages being paged in
1341 if (pp
!= NULL
&& pp
->p_vnode
!= NULL
) {
1342 if (IS_SWAPFSVP(pp
->p_vnode
)) {
1343 CPU_STATS_ADDQ(cpup
, vm
, anonpgin
, btopr(len
));
1344 atomic_add_64(&curzone
->zone_anonpgin
,
1347 if (pp
->p_vnode
->v_flag
& VVMEXEC
) {
1348 CPU_STATS_ADDQ(cpup
, vm
, execpgin
,
1350 atomic_add_64(&curzone
->zone_execpgin
,
1353 CPU_STATS_ADDQ(cpup
, vm
, fspgin
,
1355 atomic_add_64(&curzone
->zone_fspgin
,
1361 TRACE_1(TR_FAC_VM
, TR_PAGE_WS_IN
,
1362 "page_ws_in:pp %p", pp
);
1364 TNF_PROBE_3(pagein
, "vm pageio io", /* CSTYLED */,
1365 tnf_opaque
, vnode
, pp
->p_vnode
,
1366 tnf_offset
, offset
, pp
->p_offset
,
1367 tnf_size
, size
, len
);
1370 bp
= kmem_zalloc(sizeof (struct buf
), KM_SLEEP
);
1372 bp
->b_bufsize
= len
;
1374 bp
->b_flags
= B_PAGEIO
| B_NOCACHE
| B_BUSY
| flags
;
1376 sema_init(&bp
->b_io
, 0, NULL
, SEMA_DEFAULT
, NULL
);
1378 /* Initialize bp->b_sem in "locked" state */
1379 sema_init(&bp
->b_sem
, 0, NULL
, SEMA_DEFAULT
, NULL
);
1383 THREAD_KPRI_RELEASE_N(btopr(len
)); /* release kpri from page_locks */
1386 * Caller sets dev & blkno and can adjust
1387 * b_addr for page offset and can use bp_mapin
1388 * to make pages kernel addressable.
1394 pageio_done(struct buf
*bp
)
1396 ASSERT(SEMA_HELD(&bp
->b_sem
));
1397 if (bp
->b_flags
& B_REMAPPED
)
1401 ASSERT((bp
->b_flags
& B_NOCACHE
) != 0);
1403 /* A sema_v(bp->b_sem) is implied if we are destroying it */
1404 sema_destroy(&bp
->b_sem
);
1405 sema_destroy(&bp
->b_io
);
1406 kmem_free(bp
, sizeof (struct buf
));
1410 * Check to see whether the buffers, except the one pointed by sbp,
1411 * associated with the device are busy.
1412 * NOTE: This expensive operation shall be improved together with ufs_icheck().
1415 bcheck(dev_t dev
, struct buf
*sbp
)
1423 * check for busy bufs for this filesystem
1425 for (i
= 0; i
< v
.v_hbuf
; i
++) {
1426 dp
= (struct buf
*)&hbuf
[i
];
1427 hmp
= &hbuf
[i
].b_lock
;
1430 for (bp
= dp
->b_forw
; bp
!= dp
; bp
= bp
->b_forw
) {
1432 * if buf is busy or dirty, then filesystem is busy
1434 if ((bp
->b_edev
== dev
) &&
1435 ((bp
->b_flags
& B_STALE
) == 0) &&
1436 (bp
->b_flags
& (B_DELWRI
|B_BUSY
)) &&
/*
 * Hash two 32 bit entities.
 *
 * Folds each byte of x, then each byte of y, into the accumulator
 * with a multiply-by-7-and-decrement step.  Purely arithmetic; the
 * result is deterministic for a given (x, y) pair.
 */
int
hash2ints(int x, int y)
{
	int hash = 0;

	hash = ((hash * 7) + x) - 1;
	hash = ((hash * 7) + (x >> 8)) - 1;
	hash = ((hash * 7) + (x >> 16)) - 1;
	hash = ((hash * 7) + (x >> 24)) - 1;
	hash = ((hash * 7) + y) - 1;
	hash = ((hash * 7) + (y >> 8)) - 1;
	hash = ((hash * 7) + (y >> 16)) - 1;
	hash = ((hash * 7) + (y >> 24)) - 1;

	return (hash);
}
1469 * Return a new buffer struct.
1470 * Create a new buffer if we haven't gone over our high water
1471 * mark for memory, otherwise try to get one off the freelist.
1473 * Returns a locked buf that has no id and is not on any hash or free
1477 bio_getfreeblk(long bsize
)
1479 struct buf
*bp
, *dp
;
1485 * mutex_enter(&bfree_lock);
1486 * bfreelist.b_bufsize represents the amount of memory
1487 * mutex_exit(&bfree_lock); protect ref to bfreelist
1488 * we are allowed to allocate in the cache before we hit our hwm.
1490 bio_mem_get(bsize
); /* Account for our memory request */
1493 bp
= bio_bhdr_alloc(); /* Get a buf hdr */
1494 sema_p(&bp
->b_sem
); /* Should never fail */
1496 ASSERT(bp
->b_un
.b_addr
== NULL
);
1497 bp
->b_un
.b_addr
= kmem_alloc(bsize
, KM_NOSLEEP
);
1498 if (bp
->b_un
.b_addr
!= NULL
) {
1500 * Make the common path short
1502 bp
->b_bufsize
= bsize
;
1503 ASSERT(SEMA_HELD(&bp
->b_sem
));
1508 save
= bp
; /* Save bp we allocated */
1509 start
= end
= lastindex
;
1511 biostats
.bio_bufwant
.value
.ui32
++;
1514 * Memory isn't available from the system now. Scan
1515 * the hash buckets till enough space is found.
1520 dp
= (struct buf
*)hp
;
1529 if (!sema_tryp(&bp
->b_sem
)) {
1535 * Since we are going down the freelist
1536 * associated with this hash bucket the
1537 * B_DELWRI flag should not be set.
1539 ASSERT(!(bp
->b_flags
& B_DELWRI
));
1541 if (bp
->b_bufsize
== bsize
) {
1548 * Didn't kmem_alloc any more, so don't
1551 mutex_enter(&bfree_lock
);
1552 bfreelist
.b_bufsize
+= bsize
;
1553 mutex_exit(&bfree_lock
);
1556 * Update the lastindex value.
1561 * Put our saved bp back on the list
1563 sema_v(&save
->b_sem
);
1564 bio_bhdr_free(save
);
1565 ASSERT(SEMA_HELD(&bp
->b_sem
));
1572 start
= ((start
+ 1) % v
.v_hbuf
);
1573 } while (start
!= end
);
1575 biostats
.bio_bufwait
.value
.ui32
++;
1576 bp
= save
; /* Use original bp */
1577 bp
->b_un
.b_addr
= kmem_alloc(bsize
, KM_SLEEP
);
1580 bp
->b_bufsize
= bsize
;
1581 ASSERT(SEMA_HELD(&bp
->b_sem
));
1586 * Allocate a buffer header. If none currently available, allocate
1590 bio_bhdr_alloc(void)
1592 struct buf
*dp
, *sdp
;
1597 mutex_enter(&bhdr_lock
);
1598 if (bhdrlist
!= NULL
) {
1600 bhdrlist
= bp
->av_forw
;
1601 mutex_exit(&bhdr_lock
);
1605 mutex_exit(&bhdr_lock
);
1608 * Need to allocate a new pool. If the system is currently
1609 * out of memory, then try freeing things on the freelist.
1611 dp
= kmem_zalloc(sizeof (struct buf
) * v
.v_buf
, KM_NOSLEEP
);
1614 * System can't give us a pool of headers, try
1615 * recycling from the free lists.
1617 bio_recycle(BIO_HEADER
, 0);
1620 for (i
= 0; i
< v
.v_buf
; i
++, dp
++) {
1622 * The next two lines are needed since NODEV
1623 * is -1 and not NULL
1625 dp
->b_dev
= (o_dev_t
)NODEV
;
1627 dp
->av_forw
= dp
+ 1;
1628 sema_init(&dp
->b_sem
, 1, NULL
, SEMA_DEFAULT
,
1630 sema_init(&dp
->b_io
, 0, NULL
, SEMA_DEFAULT
,
1634 mutex_enter(&bhdr_lock
);
1635 (--dp
)->av_forw
= bhdrlist
; /* Fix last pointer */
1639 bhdrlist
= bp
->av_forw
;
1640 mutex_exit(&bhdr_lock
);
1649 bio_bhdr_free(struct buf
*bp
)
1651 ASSERT(bp
->b_back
== NULL
);
1652 ASSERT(bp
->b_forw
== NULL
);
1653 ASSERT(bp
->av_back
== NULL
);
1654 ASSERT(bp
->av_forw
== NULL
);
1655 ASSERT(bp
->b_un
.b_addr
== NULL
);
1656 ASSERT(bp
->b_dev
== (o_dev_t
)NODEV
);
1657 ASSERT(bp
->b_edev
== NODEV
);
1658 ASSERT(bp
->b_flags
== 0);
1660 mutex_enter(&bhdr_lock
);
1661 bp
->av_forw
= bhdrlist
;
1663 mutex_exit(&bhdr_lock
);
1667 * If we haven't gone over the high water mark, it's o.k. to
1668 * allocate more buffer space, otherwise recycle buffers
1669 * from the freelist until enough memory is free for a bsize request.
1671 * We account for this memory, even though
1672 * we don't allocate it here.
1675 bio_mem_get(long bsize
)
1677 mutex_enter(&bfree_lock
);
1678 if (bfreelist
.b_bufsize
> bsize
) {
1679 bfreelist
.b_bufsize
-= bsize
;
1680 mutex_exit(&bfree_lock
);
1683 mutex_exit(&bfree_lock
);
1684 bio_recycle(BIO_MEM
, bsize
);
1688 * flush a list of delayed write buffers.
1689 * (currently used only by bio_recycle below.)
1692 bio_flushlist(struct buf
*delwri_list
)
1696 while (delwri_list
!= EMPTY_LIST
) {
1698 bp
->b_flags
|= B_AGE
| B_ASYNC
;
1699 if (bp
->b_vp
== NULL
) { /* !ufs */
1702 UFS_BWRITE(VTOI(bp
->b_vp
)->i_ufsvfs
, bp
);
1704 delwri_list
= bp
->b_list
;
1710 * Start recycling buffers on the freelist for one of 2 reasons:
1711 * - we need a buffer header
1712 * - we need to free up memory
1713 * Once started we continue to recycle buffers until the B_AGE
1717 bio_recycle(int want
, long bsize
)
1719 struct buf
*bp
, *dp
, *dwp
, *nbp
;
1724 struct buf
*delwri_list
= EMPTY_LIST
;
1730 start
= end
= lastindex
;
1734 dp
= (struct buf
*)hp
;
1743 if (!sema_tryp(&bp
->b_sem
)) {
1748 * Do we really want to nuke all of the B_AGE stuff??
1750 if ((bp
->b_flags
& B_AGE
) == 0 && found
) {
1754 return; /* All done */
1757 ASSERT(MUTEX_HELD(&hp
->b_lock
));
1758 ASSERT(!(bp
->b_flags
& B_DELWRI
));
1763 * Remove bhdr from cache, free up memory,
1764 * and add the hdr to the freelist.
1769 if (bp
->b_bufsize
) {
1770 kmem_free(bp
->b_un
.b_addr
, bp
->b_bufsize
);
1771 bp
->b_un
.b_addr
= NULL
;
1772 mutex_enter(&bfree_lock
);
1773 bfreelist
.b_bufsize
+= bp
->b_bufsize
;
1774 mutex_exit(&bfree_lock
);
1777 bp
->b_dev
= (o_dev_t
)NODEV
;
1782 if (want
== BIO_HEADER
) {
1785 ASSERT(want
== BIO_MEM
);
1786 if (!found
&& bfreelist
.b_bufsize
>= bsize
) {
1787 /* Account for the memory we want */
1788 mutex_enter(&bfree_lock
);
1789 if (bfreelist
.b_bufsize
>= bsize
) {
1790 bfreelist
.b_bufsize
-= bsize
;
1793 mutex_exit(&bfree_lock
);
1798 * Since we dropped hmp start from the
1807 * Look at the delayed write list.
1808 * First gather into a private list, then write them.
1810 dwp
= (struct buf
*)&dwbuf
[start
];
1811 mutex_enter(&blist_lock
);
1814 for (bp
= dwp
->av_forw
; bp
!= dwp
; bp
= nbp
) {
1819 if (!sema_tryp(&bp
->b_sem
))
1821 ASSERT(bp
->b_flags
& B_DELWRI
);
1823 * Do we really want to nuke all of the B_AGE stuff??
1826 if ((bp
->b_flags
& B_AGE
) == 0 && found
) {
1830 mutex_exit(&blist_lock
);
1831 bio_flushlist(delwri_list
);
1832 mutex_enter(&blist_lock
);
1834 if (bio_flinv_cv_wanted
) {
1835 bio_flinv_cv_wanted
= 0;
1836 cv_broadcast(&bio_flushinval_cv
);
1838 mutex_exit(&blist_lock
);
1839 return; /* All done */
1843 * If the buffer is already on a flush or
1844 * invalidate list then just skip it.
1846 if (bp
->b_list
!= NULL
) {
1851 * We are still on the same bucket.
1855 bp
->b_list
= delwri_list
;
1859 mutex_exit(&blist_lock
);
1860 bio_flushlist(delwri_list
);
1861 delwri_list
= EMPTY_LIST
;
1862 mutex_enter(&blist_lock
);
1864 if (bio_flinv_cv_wanted
) {
1865 bio_flinv_cv_wanted
= 0;
1866 cv_broadcast(&bio_flushinval_cv
);
1868 mutex_exit(&blist_lock
);
1869 start
= (start
+ 1) % v
.v_hbuf
;
1871 } while (start
!= end
);
1877 * Free lists exhausted and we haven't satisfied the request.
1878 * Wait here for more entries to be added to freelist.
1879 * Because this might have just happened, make it timed.
1881 mutex_enter(&bfree_lock
);
1882 bfreelist
.b_flags
|= B_WANTED
;
1883 (void) cv_reltimedwait(&bio_mem_cv
, &bfree_lock
, hz
, TR_CLOCK_TICK
);
1884 mutex_exit(&bfree_lock
);
1889 * See if the block is associated with some buffer
1890 * (mainly to avoid getting hung up on a wait in breada).
1893 bio_incore(dev_t dev
, daddr_t blkno
)
1900 index
= bio_bhash(dev
, blkno
);
1901 dp
= (struct buf
*)&hbuf
[index
];
1902 hmp
= &hbuf
[index
].b_lock
;
1905 for (bp
= dp
->b_forw
; bp
!= dp
; bp
= bp
->b_forw
) {
1906 if (bp
->b_blkno
== blkno
&& bp
->b_edev
== dev
&&
1907 (bp
->b_flags
& B_STALE
) == 0) {
1917 bio_pageio_done(struct buf
*bp
)
1919 if (bp
->b_flags
& B_PAGEIO
) {
1921 if (bp
->b_flags
& B_REMAPPED
)
1924 if (bp
->b_flags
& B_READ
)
1925 pvn_read_done(bp
->b_pages
, bp
->b_flags
);
1927 pvn_write_done(bp
->b_pages
, B_WRITE
| bp
->b_flags
);
1930 ASSERT(bp
->b_flags
& B_REMAPPED
);
1937 * bioerror(9F) - indicate error in buffer header
1938 * If 'error' is zero, remove the error indication.
1941 bioerror(struct buf
*bp
, int error
)
1945 ASSERT(SEMA_HELD(&bp
->b_sem
));
1948 bp
->b_flags
|= B_ERROR
;
1950 bp
->b_flags
&= ~B_ERROR
;
1952 bp
->b_error
= error
;
1956 * bioreset(9F) - reuse a private buffer header after I/O is complete
1959 bioreset(struct buf
*bp
)
1968 * biosize(9F) - return size of a buffer header
1973 return (sizeof (struct buf
));
1977 * biomodified(9F) - check if buffer is modified
1980 biomodified(struct buf
*bp
)
1988 if ((bp
->b_flags
& B_PAGEIO
) == 0) {
1992 npf
= btopr(bp
->b_bcount
+ ((uintptr_t)bp
->b_un
.b_addr
& PAGEOFFSET
));
1995 ppattr
= hat_pagesync(pp
, HAT_SYNC_DONTZERO
|
1996 HAT_SYNC_STOPON_MOD
);
2007 * bioinit(9F) - initialize a buffer structure
2010 bioinit(struct buf
*bp
)
2012 bzero(bp
, sizeof (struct buf
));
2013 sema_init(&bp
->b_sem
, 0, NULL
, SEMA_DEFAULT
, NULL
);
2014 sema_init(&bp
->b_io
, 0, NULL
, SEMA_DEFAULT
, NULL
);
2019 * biofini(9F) - uninitialize a buffer structure
2022 biofini(struct buf
*bp
)
2024 sema_destroy(&bp
->b_io
);
2025 sema_destroy(&bp
->b_sem
);
2029 * bioclone(9F) - clone a buffer
2032 bioclone(struct buf
*bp
, off_t off
, size_t len
, dev_t dev
, daddr_t blkno
,
2033 int (*iodone
)(struct buf
*), struct buf
*bp_mem
, int sleep
)
2038 if (bp_mem
== NULL
) {
2039 bufp
= kmem_alloc(sizeof (struct buf
), sleep
);
2049 #define BUF_CLONE_FLAGS (B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
2053 * The cloned buffer does not inherit the B_REMAPPED flag.
2055 bufp
->b_flags
= (bp
->b_flags
& BUF_CLONE_FLAGS
) | B_BUSY
;
2056 bufp
->b_bcount
= len
;
2057 bufp
->b_blkno
= blkno
;
2058 bufp
->b_iodone
= iodone
;
2059 bufp
->b_proc
= bp
->b_proc
;
2061 bufp
->b_file
= bp
->b_file
;
2062 bufp
->b_offset
= bp
->b_offset
;
2064 if (bp
->b_flags
& B_SHADOW
) {
2065 ASSERT(bp
->b_shadow
);
2066 ASSERT(bp
->b_flags
& B_PHYS
);
2068 bufp
->b_shadow
= bp
->b_shadow
+
2069 btop(((uintptr_t)bp
->b_un
.b_addr
& PAGEOFFSET
) + off
);
2070 bufp
->b_un
.b_addr
= (caddr_t
)((uintptr_t)bp
->b_un
.b_addr
+ off
);
2071 if (bp
->b_flags
& B_REMAPPED
)
2072 bufp
->b_proc
= NULL
;
2074 if (bp
->b_flags
& B_PAGEIO
) {
2080 o
= ((uintptr_t)bp
->b_un
.b_addr
& PAGEOFFSET
) + off
;
2081 for (i
= btop(o
); i
> 0; i
--) {
2085 bufp
->b_un
.b_addr
= (caddr_t
)(o
& PAGEOFFSET
);
2088 (caddr_t
)((uintptr_t)bp
->b_un
.b_addr
+ off
);
2089 if (bp
->b_flags
& B_REMAPPED
)
2090 bufp
->b_proc
= NULL
;