kernel: getblk_common's 5th arg can be a bool
[unleashed.git] / kernel / os / bio.c
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2011 Joyent, Inc. All rights reserved.
28 * Copyright (c) 2016 by Delphix. All rights reserved.
31 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
32 /* All Rights Reserved */
35 * University Copyright- Copyright (c) 1982, 1986, 1988
36 * The Regents of the University of California
37 * All Rights Reserved
39 * University Acknowledgment- Portions of this document are derived from
40 * software developed by the University of California, Berkeley, and its
41 * contributors.
44 #include <sys/types.h>
45 #include <sys/t_lock.h>
46 #include <sys/sysmacros.h>
47 #include <sys/conf.h>
48 #include <sys/cpuvar.h>
49 #include <sys/errno.h>
50 #include <sys/debug.h>
51 #include <sys/buf.h>
52 #include <sys/var.h>
53 #include <sys/vnode.h>
54 #include <sys/bitmap.h>
55 #include <sys/cmn_err.h>
56 #include <sys/kmem.h>
57 #include <sys/vmem.h>
58 #include <sys/atomic.h>
59 #include <vm/seg_kmem.h>
60 #include <vm/page.h>
61 #include <vm/pvn.h>
62 #include <sys/vtrace.h>
63 #include <sys/fs/ufs_inode.h>
64 #include <sys/fs/ufs_bio.h>
65 #include <sys/fs/ufs_log.h>
66 #include <sys/systm.h>
67 #include <sys/vfs.h>
68 #include <sys/sdt.h>
70 /* Locks */
71 static kmutex_t blist_lock; /* protects b_list */
72 static kmutex_t bhdr_lock; /* protects the bhdrlist */
73 static kmutex_t bfree_lock; /* protects the bfreelist structure */
75 struct hbuf *hbuf; /* Hash buckets */
76 struct dwbuf *dwbuf; /* Delayed write buckets */
77 static struct buf *bhdrlist; /* buf header free list */
78 static int nbuf; /* number of buffer headers allocated */
80 static int lastindex; /* Reference point on where to start */
81 /* when looking for free buffers */
83 #define bio_bhash(dev, bn) (hash2ints((dev), (int)(bn)) & v.v_hmask)
84 #define EMPTY_LIST ((struct buf *)-1)
86 static kcondvar_t bio_mem_cv; /* Condition variables */
87 static kcondvar_t bio_flushinval_cv;
88 static int bio_doingflush; /* flush in progress */
89 static int bio_doinginval; /* inval in progress */
90 static int bio_flinv_cv_wanted; /* someone waiting for cv */
93 * Statistics on the buffer cache
95 struct biostats biostats = {
96 { "buffer_cache_lookups", KSTAT_DATA_UINT32 },
97 { "buffer_cache_hits", KSTAT_DATA_UINT32 },
98 { "new_buffer_requests", KSTAT_DATA_UINT32 },
99 { "waits_for_buffer_allocs", KSTAT_DATA_UINT32 },
100 { "buffers_locked_by_someone", KSTAT_DATA_UINT32 },
101 { "duplicate_buffers_found", KSTAT_DATA_UINT32 }
105 * kstat data
107 kstat_named_t *biostats_ptr = (kstat_named_t *)&biostats;
108 uint_t biostats_ndata = (uint_t)(sizeof (biostats) /
109 sizeof (kstat_named_t));
112 * Statistics on ufs buffer cache
113 * Not protected by locks
115 struct ufsbiostats ub = {
116 { "breads", KSTAT_DATA_UINT32 },
117 { "bwrites", KSTAT_DATA_UINT32 },
118 { "fbiwrites", KSTAT_DATA_UINT32 },
119 { "getpages", KSTAT_DATA_UINT32 },
120 { "getras", KSTAT_DATA_UINT32 },
121 { "putsyncs", KSTAT_DATA_UINT32 },
122 { "putasyncs", KSTAT_DATA_UINT32 },
123 { "putpageios", KSTAT_DATA_UINT32 },
127 * more UFS Logging eccentricities...
129 * required since "#pragma weak ..." doesn't work in reverse order.
130 * i.e.: genunix (bio.c) is loaded before the ufs modules and pointers
131 * to ufs routines don't get plugged into bio.c calls so
132 * they are initialized when setting up the "lufsops" table
133 * in "lufs.c:_init()"
135 void (*bio_lufs_strategy)(void *, buf_t *);
136 void (*bio_snapshot_strategy)(void *, buf_t *);
139 /* Private routines */
140 static struct buf *bio_getfreeblk(long);
141 static void bio_mem_get(long);
142 static void bio_bhdr_free(struct buf *);
143 static struct buf *bio_bhdr_alloc(void);
144 static void bio_recycle(int, long);
145 static void bio_pageio_done(struct buf *);
146 static int bio_incore(dev_t, daddr_t);
149 * Buffer cache constants
151 #define BIO_BUF_PERCENT (100/2) /* default: 2% of memory */
152 #define BIO_MAX_PERCENT (100/20) /* max is 20% of real memory */
153 #define BIO_BHDR_POOL 100 /* Default bhdr pool size */
154 #define BIO_MIN_HDR 10 /* Minimum number of buffer headers */
155 #define BIO_MIN_HWM (BIO_MIN_HDR * MAXBSIZE / 1024)
156 #define BIO_HASHLEN 4 /* Target length of hash chains */
159 /* Flags for bio_recycle() */
160 #define BIO_HEADER 0x01
161 #define BIO_MEM 0x02
163 extern int bufhwm; /* User tunable - high water mark for mem */
164 extern int bufhwm_pct; /* ditto - given in % of physmem */
167 * The following routines allocate and free
168 * buffers with various side effects. In general the
169 * arguments to an allocate routine are a device and
170 * a block number, and the value is a pointer to
171 * the buffer header; the buffer returned is locked with a
172 * binary semaphore so that no one else can touch it. If the block was
173 * already in core, no I/O need be done; if it is
174 * already locked, the process waits until it becomes free.
175 * The following routines allocate a buffer:
176 * getblk
177 * bread/BREAD
178 * breada
179 * Eventually the buffer must be released, possibly with the
180 * side effect of writing it out, by using one of
181 * bwrite/BWRITE/brwrite
182 * bdwrite/bdrwrite
183 * bawrite
184 * brelse
186 * The B_WANTED/B_BUSY bits are NOT used by these routines for synchronization.
187 * Instead, a binary semaphore, b_sem is used to gain exclusive access to
188 * a buffer and a binary semaphore, b_io is used for I/O synchronization.
189 * B_DONE is still used to denote a buffer with I/O complete on it.
191 * The bfreelist.b_bcount field is computed every time fsflush runs. It
192 * should not be used where a very accurate count of the free buffers is
193 * needed.
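/*
 * Example sketch of the allocate/release cycle described above, as a
 * caller might code it. The function name, device, block number and
 * block size are hypothetical; error handling is minimal.
 */
static int
example_read_block(dev_t dev, daddr_t blkno, long bsize)
{
	struct buf *bp;
	int error;

	bp = bread(dev, blkno, bsize);	/* returns with b_sem held */
	error = geterror(bp);
	if (error == 0) {
		/* inspect bp->b_un.b_addr, bp->b_bcount bytes */
	}
	brelse(bp);			/* drop b_sem, put back on a freelist */
	return (error);
}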
197 * Common code for reading a buffer with various options
199 * Read in (if necessary) the block and return a buffer pointer.
201 struct buf *
202 bread_common(void *arg, dev_t dev, daddr_t blkno, long bsize)
204 struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
205 struct buf *bp;
206 klwp_t *lwp = ttolwp(curthread);
208 CPU_STATS_ADD_K(sys, lread, 1);
209 bp = getblk_common(ufsvfsp, dev, blkno, bsize, true);
210 if (bp->b_flags & B_DONE)
211 return (bp);
212 bp->b_flags |= B_READ;
213 ASSERT(bp->b_bcount == bsize);
214 if (ufsvfsp == NULL) { /* !ufs */
215 (void) bdev_strategy(bp);
216 } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
217 /* ufs && logging */
218 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
219 } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
220 /* ufs && snapshots */
221 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
222 } else {
223 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
224 ub.ub_breads.value.ul++; /* ufs && !logging */
225 (void) bdev_strategy(bp);
227 if (lwp != NULL)
228 lwp->lwp_ru.inblock++;
229 CPU_STATS_ADD_K(sys, bread, 1);
230 (void) biowait(bp);
231 return (bp);
235 * Read in the block, like bread, but also start I/O on the
236 * read-ahead block (which is not allocated to the caller).
238 struct buf *
239 breada(dev_t dev, daddr_t blkno, daddr_t rablkno, long bsize)
241 struct buf *bp, *rabp;
242 klwp_t *lwp = ttolwp(curthread);
244 bp = NULL;
245 if (!bio_incore(dev, blkno)) {
246 CPU_STATS_ADD_K(sys, lread, 1);
247 bp = getblk(dev, blkno, bsize);
248 if ((bp->b_flags & B_DONE) == 0) {
249 bp->b_flags |= B_READ;
250 bp->b_bcount = bsize;
251 (void) bdev_strategy(bp);
252 if (lwp != NULL)
253 lwp->lwp_ru.inblock++;
254 CPU_STATS_ADD_K(sys, bread, 1);
257 if (rablkno && bfreelist.b_bcount > 1 &&
258 !bio_incore(dev, rablkno)) {
259 rabp = getblk(dev, rablkno, bsize);
260 if (rabp->b_flags & B_DONE)
261 brelse(rabp);
262 else {
263 rabp->b_flags |= B_READ|B_ASYNC;
264 rabp->b_bcount = bsize;
265 (void) bdev_strategy(rabp);
266 if (lwp != NULL)
267 lwp->lwp_ru.inblock++;
268 CPU_STATS_ADD_K(sys, bread, 1);
271 if (bp == NULL)
272 return (bread(dev, blkno, bsize));
273 (void) biowait(bp);
274 return (bp);
278 * Common code for writing a buffer with various options.
280 * force_wait - wait for write completion regardless of B_ASYNC flag
281 * do_relse - release the buffer when we are done
282 * clear_flags - flags to clear from the buffer
284 void
285 bwrite_common(void *arg, struct buf *bp, bool force_wait, bool do_relse,
286 int clear_flags)
288 register int do_wait;
289 struct ufsvfs *ufsvfsp = (struct ufsvfs *)arg;
290 int flag;
291 klwp_t *lwp = ttolwp(curthread);
292 struct cpu *cpup;
294 ASSERT(SEMA_HELD(&bp->b_sem));
295 flag = bp->b_flags;
296 bp->b_flags &= ~clear_flags;
297 if (lwp != NULL)
298 lwp->lwp_ru.oublock++;
299 CPU_STATS_ENTER_K();
300 cpup = CPU; /* get pointer AFTER preemption is disabled */
301 CPU_STATS_ADDQ(cpup, sys, lwrite, 1);
302 CPU_STATS_ADDQ(cpup, sys, bwrite, 1);
303 do_wait = ((flag & B_ASYNC) == 0 || force_wait);
304 if (do_wait == 0)
305 CPU_STATS_ADDQ(cpup, sys, bawrite, 1);
306 CPU_STATS_EXIT_K();
307 if (ufsvfsp == NULL) {
308 (void) bdev_strategy(bp);
309 } else if (ufsvfsp->vfs_log && bio_lufs_strategy != NULL) {
310 /* ufs && logging */
311 (*bio_lufs_strategy)(ufsvfsp->vfs_log, bp);
312 } else if (ufsvfsp->vfs_snapshot && bio_snapshot_strategy != NULL) {
313 /* ufs && snapshots */
314 (*bio_snapshot_strategy)(&ufsvfsp->vfs_snapshot, bp);
315 } else {
316 ub.ub_bwrites.value.ul++; /* ufs && !logging */
317 (void) bdev_strategy(bp);
319 if (do_wait) {
320 (void) biowait(bp);
321 if (do_relse) {
322 brelse(bp);
328 * Release the buffer, marking it so that if it is grabbed
329 * for another purpose it will be written out before being
330 * given up (e.g. when writing a partial block where it is
331 * assumed that another write for the same block will soon follow).
332 * Also save the time that the block is first marked as delayed
333 * so that it will be written in a reasonable time.
335 void
336 bdwrite(struct buf *bp)
338 ASSERT(SEMA_HELD(&bp->b_sem));
339 CPU_STATS_ADD_K(sys, lwrite, 1);
340 if ((bp->b_flags & B_DELWRI) == 0)
341 bp->b_start = ddi_get_lbolt();
343 * B_DONE allows others to use the buffer, B_DELWRI causes the
344 * buffer to be written before being reused, and setting b_resid
345 * to zero says the buffer is complete.
347 bp->b_flags |= B_DELWRI | B_DONE;
348 bp->b_resid = 0;
349 brelse(bp);
353 * Release the buffer, start I/O on it, but don't wait for completion.
355 void
356 bawrite(struct buf *bp)
358 ASSERT(SEMA_HELD(&bp->b_sem));
360 /* Use bfreelist.b_bcount as a weird-ass heuristic */
361 if (bfreelist.b_bcount > 4)
362 bp->b_flags |= B_ASYNC;
363 bwrite(bp);
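/*
 * Example sketch contrasting the write-side release routines above.
 * The function and the 'how' selector are hypothetical; 'bp' is a
 * locked buffer whose contents have already been modified.
 */
static void
example_release_modified(struct buf *bp, int how)
{
	if (how == 0)
		bwrite(bp);	/* write now and wait for completion */
	else if (how == 1)
		bawrite(bp);	/* start the write, do not wait (B_ASYNC) */
	else
		bdwrite(bp);	/* only mark B_DELWRI; fsflush writes it later */
}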
367 * Release the buffer, with no I/O implied.
369 void
370 brelse(struct buf *bp)
372 struct buf **backp;
373 uint_t index;
374 kmutex_t *hmp;
375 struct buf *dp;
376 struct hbuf *hp;
379 ASSERT(SEMA_HELD(&bp->b_sem));
382 * Clear the retry write flag if the buffer was written without
383 * error. The presence of B_DELWRI means the buffer has not yet
384 * been written and the presence of B_ERROR means that an error
385 * is still occurring.
387 if ((bp->b_flags & (B_ERROR | B_DELWRI | B_RETRYWRI)) == B_RETRYWRI) {
388 bp->b_flags &= ~B_RETRYWRI;
391 /* Check for anomalous conditions */
392 if (bp->b_flags & (B_ERROR|B_NOCACHE)) {
393 if (bp->b_flags & B_NOCACHE) {
394 /* Don't add to the freelist. Destroy it now */
395 kmem_free(bp->b_un.b_addr, bp->b_bufsize);
396 sema_destroy(&bp->b_sem);
397 sema_destroy(&bp->b_io);
398 kmem_free(bp, sizeof (struct buf));
399 return;
402 * If a write failed and we are supposed to retry write,
403 * don't toss the buffer. Keep it around and mark it
404 * delayed write in the hopes that it will eventually
405 * get flushed (and still keep the system running.)
407 if ((bp->b_flags & (B_READ | B_RETRYWRI)) == B_RETRYWRI) {
408 bp->b_flags |= B_DELWRI;
409 /* keep fsflush from trying continuously to flush */
410 bp->b_start = ddi_get_lbolt();
411 } else
412 bp->b_flags |= B_AGE|B_STALE;
413 bp->b_flags &= ~B_ERROR;
414 bp->b_error = 0;
418 * If delayed write is set then put in on the delayed
419 * write list instead of the free buffer list.
421 index = bio_bhash(bp->b_edev, bp->b_blkno);
422 hmp = &hbuf[index].b_lock;
424 mutex_enter(hmp);
425 hp = &hbuf[index];
426 dp = (struct buf *)hp;
429 * Make sure that the number of entries on this list is within
430 * Zero <= count <= total # buffers
432 ASSERT(hp->b_length >= 0);
433 ASSERT(hp->b_length < nbuf);
435 hp->b_length++; /* We are adding this buffer */
437 if (bp->b_flags & B_DELWRI) {
439 * This buffer goes on the delayed write buffer list
441 dp = (struct buf *)&dwbuf[index];
443 ASSERT(bp->b_bufsize > 0);
444 ASSERT(bp->b_bcount > 0);
445 ASSERT(bp->b_un.b_addr != NULL);
447 if (bp->b_flags & B_AGE) {
448 backp = &dp->av_forw;
449 (*backp)->av_back = bp;
450 bp->av_forw = *backp;
451 *backp = bp;
452 bp->av_back = dp;
453 } else {
454 backp = &dp->av_back;
455 (*backp)->av_forw = bp;
456 bp->av_back = *backp;
457 *backp = bp;
458 bp->av_forw = dp;
460 mutex_exit(hmp);
462 if (bfreelist.b_flags & B_WANTED) {
464 * Should come here very very rarely.
466 mutex_enter(&bfree_lock);
467 if (bfreelist.b_flags & B_WANTED) {
468 bfreelist.b_flags &= ~B_WANTED;
469 cv_broadcast(&bio_mem_cv);
471 mutex_exit(&bfree_lock);
474 bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC);
476 * Don't let anyone get the buffer off the freelist before we
477 * release our hold on it.
479 sema_v(&bp->b_sem);
483 * Return a count of the number of B_BUSY buffers in the system.
484 * The count is only a rough estimate. If 'cleanit' is set,
485 * try to flush all bufs.
488 bio_busy(int cleanit)
490 struct buf *bp, *dp;
491 int busy = 0;
492 int i;
493 kmutex_t *hmp;
495 for (i = 0; i < v.v_hbuf; i++) {
496 dp = (struct buf *)&hbuf[i];
497 hmp = &hbuf[i].b_lock;
499 mutex_enter(hmp);
500 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
501 if (bp->b_flags & B_BUSY)
502 busy++;
504 mutex_exit(hmp);
507 if (cleanit && busy != 0) {
508 bflush(NODEV);
511 return (busy);
515 * Assign a buffer for the given block. If the appropriate
516 * block is already associated, return it; otherwise search
517 * for the oldest non-busy buffer and reassign it.
519 struct buf *
520 getblk_common(void * arg, dev_t dev, daddr_t blkno, long bsize, bool errflg)
522 ufsvfs_t *ufsvfsp = (struct ufsvfs *)arg;
523 struct buf *bp;
524 struct buf *dp;
525 struct buf *nbp = NULL;
526 struct buf *errbp;
527 uint_t index;
528 kmutex_t *hmp;
529 struct hbuf *hp;
531 if (getmajor(dev) >= devcnt)
532 cmn_err(CE_PANIC, "blkdev");
534 biostats.bio_lookup.value.ui32++;
536 index = bio_bhash(dev, blkno);
537 hp = &hbuf[index];
538 dp = (struct buf *)hp;
539 hmp = &hp->b_lock;
541 mutex_enter(hmp);
542 loop:
543 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
544 if (bp->b_blkno != blkno || bp->b_edev != dev ||
545 (bp->b_flags & B_STALE))
546 continue;
548 * Avoid holding the hash lock in the event that
549 * the buffer is locked by someone. Since the hash chain
550 * may change when we drop the hash lock
551 * we have to start at the beginning of the chain if the
552 * buffer identity/contents aren't valid.
554 if (!sema_tryp(&bp->b_sem)) {
555 biostats.bio_bufbusy.value.ui32++;
556 mutex_exit(hmp);
558 * OK, we are dealing with a busy buffer.
559 * In the case that we are panicking and we
560 * got called from bread(), we have some chance
561 * for error recovery. So better bail out from
562 * here since sema_p() won't block. If we got
563 * called directly from ufs routines, there is
564 * no way to report an error yet.
566 if (panicstr && errflg)
567 goto errout;
569 * For the following line of code to work
570 * correctly never kmem_free the buffer "header".
572 sema_p(&bp->b_sem);
573 if (bp->b_blkno != blkno || bp->b_edev != dev ||
574 (bp->b_flags & B_STALE)) {
575 sema_v(&bp->b_sem);
576 mutex_enter(hmp);
577 goto loop; /* start over */
579 mutex_enter(hmp);
581 /* Found */
582 biostats.bio_hit.value.ui32++;
583 bp->b_flags &= ~B_AGE;
586 * Yank it off the free/delayed write lists
588 hp->b_length--;
589 notavail(bp);
590 mutex_exit(hmp);
592 ASSERT((bp->b_flags & B_NOCACHE) == 0);
594 if (nbp == NULL) {
596 * Make the common path short.
598 ASSERT(SEMA_HELD(&bp->b_sem));
599 return (bp);
602 biostats.bio_bufdup.value.ui32++;
605 * The buffer must have been entered into the hash during the lock
606 * upgrade, so free the new buffer we allocated and return the
607 * found buffer.
609 kmem_free(nbp->b_un.b_addr, nbp->b_bufsize);
610 nbp->b_un.b_addr = NULL;
613 * Account for the memory
615 mutex_enter(&bfree_lock);
616 bfreelist.b_bufsize += nbp->b_bufsize;
617 mutex_exit(&bfree_lock);
620 * Destroy buf identity, and place on avail list
622 nbp->b_dev = (o_dev_t)NODEV;
623 nbp->b_edev = NODEV;
624 nbp->b_flags = 0;
625 nbp->b_file = NULL;
626 nbp->b_offset = -1;
628 sema_v(&nbp->b_sem);
629 bio_bhdr_free(nbp);
631 ASSERT(SEMA_HELD(&bp->b_sem));
632 return (bp);
636 * bio_getfreeblk may block so check the hash chain again.
638 if (nbp == NULL) {
639 mutex_exit(hmp);
640 nbp = bio_getfreeblk(bsize);
641 mutex_enter(hmp);
642 goto loop;
646 * New buffer. Assign nbp and stick it on the hash.
648 nbp->b_flags = B_BUSY;
649 nbp->b_edev = dev;
650 nbp->b_dev = (o_dev_t)cmpdev(dev);
651 nbp->b_blkno = blkno;
652 nbp->b_iodone = NULL;
653 nbp->b_bcount = bsize;
655 * If we are given a ufsvfsp and the vfs_root field is NULL
656 * then this must be I/O for a superblock. A superblock's
657 * buffer is set up in mountfs() and there is no root vnode
658 * at that point.
660 if (ufsvfsp && ufsvfsp->vfs_root) {
661 nbp->b_vp = ufsvfsp->vfs_root;
662 } else {
663 nbp->b_vp = NULL;
666 ASSERT((nbp->b_flags & B_NOCACHE) == 0);
668 binshash(nbp, dp);
669 mutex_exit(hmp);
671 ASSERT(SEMA_HELD(&nbp->b_sem));
673 return (nbp);
677 * Come here in case of an internal error. At this point we couldn't
678 * get a buffer, but we have to return one. Hence we allocate some
679 * kind of error reply buffer on the fly. This buffer is marked as
680 * B_NOCACHE | B_AGE | B_ERROR | B_DONE to assure the following:
681 * - B_ERROR will indicate error to the caller.
682 * - B_DONE will prevent us from reading the buffer from
683 * the device.
684 * - B_NOCACHE will cause this buffer to be freed in
685 * brelse().
688 errout:
689 errbp = geteblk();
690 sema_p(&errbp->b_sem);
691 errbp->b_flags &= ~B_BUSY;
692 errbp->b_flags |= (B_ERROR | B_DONE);
693 return (errbp);
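/*
 * Example sketch of the bool errflg argument. bread_common() above
 * passes true so that a busy buffer found while panicking takes the
 * errout path above; a getblk()-style caller with no way to report
 * errors would pass false. The wrapper name is hypothetical.
 */
static struct buf *
example_getblk(dev_t dev, daddr_t blkno, long bsize)
{
	return (getblk_common(NULL, dev, blkno, bsize, false));
}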
697 * Get an empty block, not assigned to any particular device.
698 * Returns a locked buffer that is not on any hash or free list.
700 struct buf *
701 ngeteblk(long bsize)
703 struct buf *bp;
705 bp = kmem_alloc(sizeof (struct buf), KM_SLEEP);
706 bioinit(bp);
707 bp->av_forw = bp->av_back = NULL;
708 bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
709 bp->b_bufsize = bsize;
710 bp->b_flags = B_BUSY | B_NOCACHE | B_AGE;
711 bp->b_dev = (o_dev_t)NODEV;
712 bp->b_edev = NODEV;
713 bp->b_lblkno = 0;
714 bp->b_bcount = bsize;
715 bp->b_iodone = NULL;
716 return (bp);
720 * Interface of geteblk() is kept intact to maintain driver compatibility.
721 * Use ngeteblk() to allocate a block size other than 1 KB.
723 struct buf *
724 geteblk(void)
726 return (ngeteblk((long)1024));
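/*
 * Example sketch of a private, unhashed buffer from ngeteblk(). The
 * buffer is B_NOCACHE, so brelse() destroys it instead of caching it.
 * The function name and the use of the data area are hypothetical.
 */
static void
example_private_buf(long bsize)
{
	struct buf *bp = ngeteblk(bsize);	/* locked, on no hash/free list */

	clrbuf(bp);				/* zero b_un.b_addr */
	/* ... fill bp->b_un.b_addr, set b_edev/b_blkno, do I/O ... */
	brelse(bp);				/* B_NOCACHE: freed here */
}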
730 * Return a buffer w/o sleeping
732 struct buf *
733 trygetblk(dev_t dev, daddr_t blkno)
735 struct buf *bp;
736 struct buf *dp;
737 struct hbuf *hp;
738 kmutex_t *hmp;
739 uint_t index;
741 index = bio_bhash(dev, blkno);
742 hp = &hbuf[index];
743 hmp = &hp->b_lock;
745 if (!mutex_tryenter(hmp))
746 return (NULL);
748 dp = (struct buf *)hp;
749 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
750 if (bp->b_blkno != blkno || bp->b_edev != dev ||
751 (bp->b_flags & B_STALE))
752 continue;
754 * Get access to a valid buffer without sleeping
756 if (sema_tryp(&bp->b_sem)) {
757 if (bp->b_flags & B_DONE) {
758 hp->b_length--;
759 notavail(bp);
760 mutex_exit(hmp);
761 return (bp);
762 } else {
763 sema_v(&bp->b_sem);
764 break;
767 break;
769 mutex_exit(hmp);
770 return (NULL);
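/*
 * Example sketch of trygetblk() as a non-blocking cache probe. A NULL
 * return only means the buffer could not be obtained without sleeping,
 * not that the block is absent. The function name is hypothetical.
 */
static bool
example_peek_cached(dev_t dev, daddr_t blkno)
{
	struct buf *bp = trygetblk(dev, blkno);

	if (bp == NULL)
		return (false);
	/* bp is locked, B_DONE, and off the freelist here */
	brelse(bp);
	return (true);
}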
774 * Wait for I/O completion on the buffer; return errors
775 * to the user.
778 iowait(struct buf *bp)
780 ASSERT(SEMA_HELD(&bp->b_sem));
781 return (biowait(bp));
785 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
786 * and wake up anyone waiting for it.
788 void
789 iodone(struct buf *bp)
791 ASSERT(SEMA_HELD(&bp->b_sem));
792 (void) biodone(bp);
796 * Zero the core associated with a buffer.
798 void
799 clrbuf(struct buf *bp)
801 ASSERT(SEMA_HELD(&bp->b_sem));
802 bzero(bp->b_un.b_addr, bp->b_bcount);
803 bp->b_resid = 0;
808 * Make sure all write-behind blocks on dev (or NODEV for all)
809 * are flushed out.
811 void
812 bflush(dev_t dev)
814 struct buf *bp, *dp;
815 struct hbuf *hp;
816 struct buf *delwri_list = EMPTY_LIST;
817 int i, index;
818 kmutex_t *hmp;
820 mutex_enter(&blist_lock);
822 * Wait for any invalidates or flushes ahead of us to finish.
823 * We really could split blist_lock up per device for better
824 * parallelism here.
826 while (bio_doinginval || bio_doingflush) {
827 bio_flinv_cv_wanted = 1;
828 cv_wait(&bio_flushinval_cv, &blist_lock);
830 bio_doingflush++;
832 * Gather all B_DELWRI buffer for device.
833 * Lock ordering is b_sem > hash lock (brelse).
834 * Since we are finding the buffer via the delayed write list,
835 * it may be busy and we would block trying to get the
836 * b_sem lock while holding hash lock. So transfer all the
837 * candidates onto the delwri_list and then drop the hash locks.
839 for (i = 0; i < v.v_hbuf; i++) {
840 hmp = &hbuf[i].b_lock;
841 dp = (struct buf *)&dwbuf[i];
842 mutex_enter(hmp);
843 for (bp = dp->av_forw; bp != dp; bp = bp->av_forw) {
844 if (dev == NODEV || bp->b_edev == dev) {
845 if (bp->b_list == NULL) {
846 bp->b_list = delwri_list;
847 delwri_list = bp;
851 mutex_exit(hmp);
853 mutex_exit(&blist_lock);
856 * Now that the hash locks have been dropped grab the semaphores
857 * and write back all the buffers that have B_DELWRI set.
859 while (delwri_list != EMPTY_LIST) {
860 bp = delwri_list;
862 sema_p(&bp->b_sem); /* may block */
863 if ((dev != bp->b_edev && dev != NODEV) ||
864 (panicstr && bp->b_flags & B_BUSY)) {
865 sema_v(&bp->b_sem);
866 delwri_list = bp->b_list;
867 bp->b_list = NULL;
868 continue; /* No longer a candidate */
870 if (bp->b_flags & B_DELWRI) {
871 index = bio_bhash(bp->b_edev, bp->b_blkno);
872 hp = &hbuf[index];
873 hmp = &hp->b_lock;
874 dp = (struct buf *)hp;
876 bp->b_flags |= B_ASYNC;
877 mutex_enter(hmp);
878 hp->b_length--;
879 notavail(bp);
880 mutex_exit(hmp);
881 if (bp->b_vp == NULL) { /* !ufs */
882 bwrite(bp);
883 } else { /* ufs */
884 UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
886 } else {
887 sema_v(&bp->b_sem);
889 delwri_list = bp->b_list;
890 bp->b_list = NULL;
892 mutex_enter(&blist_lock);
893 bio_doingflush--;
894 if (bio_flinv_cv_wanted) {
895 bio_flinv_cv_wanted = 0;
896 cv_broadcast(&bio_flushinval_cv);
898 mutex_exit(&blist_lock);
902 * Ensure that a specified block is up-to-date on disk.
904 void
905 blkflush(dev_t dev, daddr_t blkno)
907 struct buf *bp, *dp;
908 struct hbuf *hp;
909 struct buf *sbp = NULL;
910 uint_t index;
911 kmutex_t *hmp;
913 index = bio_bhash(dev, blkno);
914 hp = &hbuf[index];
915 dp = (struct buf *)hp;
916 hmp = &hp->b_lock;
919 * Identify the buffer in the cache belonging to
920 * this device and blkno (if any).
922 mutex_enter(hmp);
923 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
924 if (bp->b_blkno != blkno || bp->b_edev != dev ||
925 (bp->b_flags & B_STALE))
926 continue;
927 sbp = bp;
928 break;
930 mutex_exit(hmp);
931 if (sbp == NULL)
932 return;
934 * Now check the buffer we have identified and
935 * make sure it still belongs to the device and is B_DELWRI
937 sema_p(&sbp->b_sem);
938 if (sbp->b_blkno == blkno && sbp->b_edev == dev &&
939 (sbp->b_flags & (B_DELWRI|B_STALE)) == B_DELWRI) {
940 mutex_enter(hmp);
941 hp->b_length--;
942 notavail(sbp);
943 mutex_exit(hmp);
945 * XXX - There is nothing to guarantee a synchronous
946 * write here if the B_ASYNC flag is set. This needs
947 * some investigation.
949 if (sbp->b_vp == NULL) { /* !ufs */
950 bwrite(sbp); /* synchronous write */
951 } else { /* ufs */
952 UFS_BWRITE(VTOI(sbp->b_vp)->i_ufsvfs, sbp);
954 } else {
955 sema_v(&sbp->b_sem);
960 * Same as binval, except can force-invalidate delayed-write buffers
961 * (which may not have been flushed because of device errors). Also
962 * makes sure that the retry write flag is cleared.
965 bfinval(dev_t dev, int force)
967 struct buf *dp;
968 struct buf *bp;
969 struct buf *binval_list = EMPTY_LIST;
970 int i, error = 0;
971 kmutex_t *hmp;
972 uint_t index;
973 struct buf **backp;
975 mutex_enter(&blist_lock);
977 * Wait for any flushes ahead of us to finish; it's ok to
978 * do invalidates in parallel.
980 while (bio_doingflush) {
981 bio_flinv_cv_wanted = 1;
982 cv_wait(&bio_flushinval_cv, &blist_lock);
984 bio_doinginval++;
986 /* Gather bp's */
987 for (i = 0; i < v.v_hbuf; i++) {
988 dp = (struct buf *)&hbuf[i];
989 hmp = &hbuf[i].b_lock;
991 mutex_enter(hmp);
992 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
993 if (bp->b_edev == dev) {
994 if (bp->b_list == NULL) {
995 bp->b_list = binval_list;
996 binval_list = bp;
1000 mutex_exit(hmp);
1002 mutex_exit(&blist_lock);
1004 /* Invalidate all bp's found */
1005 while (binval_list != EMPTY_LIST) {
1006 bp = binval_list;
1008 sema_p(&bp->b_sem);
1009 if (bp->b_edev == dev) {
1010 if (force && (bp->b_flags & B_DELWRI)) {
1011 /* clear B_DELWRI, move to non-dw freelist */
1012 index = bio_bhash(bp->b_edev, bp->b_blkno);
1013 hmp = &hbuf[index].b_lock;
1014 dp = (struct buf *)&hbuf[index];
1015 mutex_enter(hmp);
1017 /* remove from delayed write freelist */
1018 notavail(bp);
1020 /* add to B_AGE side of non-dw freelist */
1021 backp = &dp->av_forw;
1022 (*backp)->av_back = bp;
1023 bp->av_forw = *backp;
1024 *backp = bp;
1025 bp->av_back = dp;
1028 * make sure write retries and busy are cleared
1030 bp->b_flags &=
1031 ~(B_BUSY | B_DELWRI | B_RETRYWRI);
1032 mutex_exit(hmp);
1034 if ((bp->b_flags & B_DELWRI) == 0)
1035 bp->b_flags |= B_STALE|B_AGE;
1036 else
1037 error = EIO;
1039 sema_v(&bp->b_sem);
1040 binval_list = bp->b_list;
1041 bp->b_list = NULL;
1043 mutex_enter(&blist_lock);
1044 bio_doinginval--;
1045 if (bio_flinv_cv_wanted) {
1046 cv_broadcast(&bio_flushinval_cv);
1047 bio_flinv_cv_wanted = 0;
1049 mutex_exit(&blist_lock);
1050 return (error);
1054 * If possible, invalidate blocks for a dev on demand
1056 void
1057 binval(dev_t dev)
1059 (void) bfinval(dev, 0);
1063 * Initialize the buffer I/O system by freeing
1064 * all buffers and setting all device hash buffer lists to empty.
1066 void
1067 binit(void)
1069 struct buf *bp;
1070 unsigned int i, pct;
1071 ulong_t bio_max_hwm, bio_default_hwm;
1074 * Maximum/Default values for bufhwm are set to the smallest of:
1075 * - BIO_MAX_PERCENT resp. BIO_BUF_PERCENT of real memory
1076 * - 1/4 of kernel virtual memory
1077 * - INT32_MAX to prevent overflows of v.v_bufhwm (which is int).
1078 * Additionally, in order to allow simple tuning by percentage of
1079 * physical memory, bufhwm_pct is used to calculate the default if
1080 * the value of this tunable is between 0 and BIO_MAX_PERCENT.
1082 * Since the unit for v.v_bufhwm is kilobytes, this allows for
1083 * a maximum of 1024 * 2GB == 2TB memory usage by buffer headers.
1085 bio_max_hwm = MIN(physmem / BIO_MAX_PERCENT,
1086 btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1087 bio_max_hwm = MIN(INT32_MAX, bio_max_hwm);
1089 pct = BIO_BUF_PERCENT;
1090 if (bufhwm_pct != 0 &&
1091 ((pct = 100 / bufhwm_pct) < BIO_MAX_PERCENT)) {
1092 pct = BIO_BUF_PERCENT;
1094 * Invalid user specified value, emit a warning.
1096 cmn_err(CE_WARN, "binit: bufhwm_pct(%d) out of \
1097 range(1..%d). Using %d as default.",
1098 bufhwm_pct,
1099 100 / BIO_MAX_PERCENT, 100 / BIO_BUF_PERCENT);
1102 bio_default_hwm = MIN(physmem / pct,
1103 btop(vmem_size(heap_arena, VMEM_FREE)) / 4) * (PAGESIZE / 1024);
1104 bio_default_hwm = MIN(INT32_MAX, bio_default_hwm);
1106 if ((v.v_bufhwm = bufhwm) == 0)
1107 v.v_bufhwm = bio_default_hwm;
1109 if (v.v_bufhwm < BIO_MIN_HWM || v.v_bufhwm > bio_max_hwm) {
1110 v.v_bufhwm = (int)bio_max_hwm;
1112 * Invalid user specified value, emit a warning.
1114 cmn_err(CE_WARN,
1115 "binit: bufhwm(%d) out \
1116 of range(%d..%lu). Using %lu as default",
1117 bufhwm,
1118 BIO_MIN_HWM, bio_max_hwm, bio_max_hwm);
1122 * Determine the number of hash buckets. Default is to
1123 * create ~BIO_HASHLEN entries per chain based on MAXBSIZE buffers.
1124 * Round up number to the next power of 2.
1126 v.v_hbuf = 1 << highbit((((ulong_t)v.v_bufhwm * 1024) / MAXBSIZE) /
1127 BIO_HASHLEN);
1128 v.v_hmask = v.v_hbuf - 1;
1129 v.v_buf = BIO_BHDR_POOL;
1131 hbuf = kmem_zalloc(v.v_hbuf * sizeof (struct hbuf), KM_SLEEP);
1133 dwbuf = kmem_zalloc(v.v_hbuf * sizeof (struct dwbuf), KM_SLEEP);
1135 bfreelist.b_bufsize = (size_t)v.v_bufhwm * 1024;
1136 bp = &bfreelist;
1137 bp->b_forw = bp->b_back = bp->av_forw = bp->av_back = bp;
1139 for (i = 0; i < v.v_hbuf; i++) {
1140 hbuf[i].b_forw = hbuf[i].b_back = (struct buf *)&hbuf[i];
1141 hbuf[i].av_forw = hbuf[i].av_back = (struct buf *)&hbuf[i];
1144 * Initialize the delayed write buffer list.
1146 dwbuf[i].b_forw = dwbuf[i].b_back = (struct buf *)&dwbuf[i];
1147 dwbuf[i].av_forw = dwbuf[i].av_back = (struct buf *)&dwbuf[i];
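/*
 * Worked example of the sizing above (illustrative; assumes 4 KB pages,
 * 1 GB of physical memory, i.e. physmem = 262144 pages, and ample free
 * space in heap_arena):
 *
 *	bio_max_hwm     = (262144 / BIO_MAX_PERCENT) * (PAGESIZE / 1024)
 *	                = (262144 / 5) * 4  = 209712 KB  (~20% of memory)
 *	bio_default_hwm = (262144 / BIO_BUF_PERCENT) * (PAGESIZE / 1024)
 *	                = (262144 / 50) * 4 = 20968 KB   (~2% of memory)
 *
 * With bufhwm and bufhwm_pct both left at 0, v.v_bufhwm therefore ends
 * up at roughly 20 MB of buffer cache for this configuration.
 */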
1152 * Wait for I/O completion on the buffer; return error code.
1153 * If bp was for synchronous I/O, bp is invalid and associated
1154 * resources are freed on return.
1157 biowait(struct buf *bp)
1159 int error = 0;
1160 struct cpu *cpup;
1162 ASSERT(SEMA_HELD(&bp->b_sem));
1164 cpup = CPU;
1165 atomic_inc_64(&cpup->cpu_stats.sys.iowait);
1166 DTRACE_IO1(wait__start, struct buf *, bp);
1169 * In case of panic, busy wait for completion
1171 if (panicstr) {
1172 while ((bp->b_flags & B_DONE) == 0)
1173 drv_usecwait(10);
1174 } else
1175 sema_p(&bp->b_io);
1177 DTRACE_IO1(wait__done, struct buf *, bp);
1178 atomic_dec_64(&cpup->cpu_stats.sys.iowait);
1180 error = geterror(bp);
1181 if ((bp->b_flags & B_ASYNC) == 0) {
1182 if (bp->b_flags & B_REMAPPED)
1183 bp_mapout(bp);
1185 return (error);
1189 * Mark I/O complete on a buffer, release it if I/O is asynchronous,
1190 * and wake up anyone waiting for it.
1192 void
1193 biodone(struct buf *bp)
1195 if (bp->b_flags & B_STARTED) {
1196 DTRACE_IO1(done, struct buf *, bp);
1197 bp->b_flags &= ~B_STARTED;
1200 if (bp->b_iodone != NULL) {
1201 (*(bp->b_iodone))(bp);
1202 return;
1204 ASSERT((bp->b_flags & B_DONE) == 0);
1205 ASSERT(SEMA_HELD(&bp->b_sem));
1206 bp->b_flags |= B_DONE;
1207 if (bp->b_flags & B_ASYNC) {
1208 if (bp->b_flags & (B_PAGEIO|B_REMAPPED))
1209 bio_pageio_done(bp);
1210 else
1211 brelse(bp); /* release bp to freelist */
1212 } else {
1213 sema_v(&bp->b_io);
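/*
 * Example sketch of asynchronous completion through b_iodone. Because
 * biodone() above calls the callback and returns immediately, the
 * callback owns the rest of the teardown; clearing b_iodone and calling
 * biodone() again runs the default completion path. The example_*
 * names are hypothetical.
 */
static int
example_done(struct buf *bp)
{
	if (geterror(bp) != 0) {
		/* ... log or arrange a retry ... */
	}
	bp->b_iodone = NULL;
	biodone(bp);		/* second pass: sets B_DONE, releases bp */
	return (0);
}

static void
example_start_async(struct buf *bp)	/* bp locked and set up for I/O */
{
	bp->b_flags |= B_ASYNC;
	bp->b_iodone = example_done;
	(void) bdev_strategy(bp);
}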
1218 * Pick up the device's error number and pass it to the user;
1219 * if there is an error but the number is 0 set a generalized code.
1222 geterror(struct buf *bp)
1224 int error = 0;
1226 ASSERT(SEMA_HELD(&bp->b_sem));
1227 if (bp->b_flags & B_ERROR) {
1228 error = bp->b_error;
1229 if (!error)
1230 error = EIO;
1232 return (error);
1236 * Support for pageio buffers.
1238 * This stuff should be generalized to provide a bp
1239 * header facility that can be used for things other than pageio.
1243 * Allocate and initialize a buf struct for use with pageio.
1245 struct buf *
1246 pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags)
1248 struct buf *bp;
1249 struct cpu *cpup;
1251 if (flags & B_READ) {
1252 CPU_STATS_ENTER_K();
1253 cpup = CPU; /* get pointer AFTER preemption is disabled */
1254 CPU_STATS_ADDQ(cpup, vm, pgin, 1);
1255 CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len));
1257 atomic_add_64(&curzone->zone_pgpgin, btopr(len));
1259 if ((flags & B_ASYNC) == 0) {
1260 klwp_t *lwp = ttolwp(curthread);
1261 if (lwp != NULL)
1262 lwp->lwp_ru.majflt++;
1263 CPU_STATS_ADDQ(cpup, vm, maj_fault, 1);
1266 * Update statistics for pages being paged in
1268 if (pp != NULL && pp->p_vnode != NULL) {
1269 if (IS_SWAPFSVP(pp->p_vnode)) {
1270 CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len));
1271 atomic_add_64(&curzone->zone_anonpgin,
1272 btopr(len));
1273 } else {
1274 if (pp->p_vnode->v_flag & VVMEXEC) {
1275 CPU_STATS_ADDQ(cpup, vm, execpgin,
1276 btopr(len));
1277 atomic_add_64(&curzone->zone_execpgin,
1278 btopr(len));
1279 } else {
1280 CPU_STATS_ADDQ(cpup, vm, fspgin,
1281 btopr(len));
1282 atomic_add_64(&curzone->zone_fspgin,
1283 btopr(len));
1287 CPU_STATS_EXIT_K();
1288 TRACE_1(TR_FAC_VM, TR_PAGE_WS_IN,
1289 "page_ws_in:pp %p", pp);
1292 bp = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
1293 bp->b_bcount = len;
1294 bp->b_bufsize = len;
1295 bp->b_pages = pp;
1296 bp->b_flags = B_PAGEIO | B_NOCACHE | B_BUSY | flags;
1297 bp->b_offset = -1;
1298 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
1300 /* Initialize bp->b_sem in "locked" state */
1301 sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
1303 VN_HOLD(vp);
1304 bp->b_vp = vp;
1305 THREAD_KPRI_RELEASE_N(btopr(len)); /* release kpri from page_locks */
1308 * Caller sets dev & blkno and can adjust
1309 * b_addr for page offset and can use bp_mapin
1310 * to make pages kernel addressable.
1312 return (bp);
1315 void
1316 pageio_done(struct buf *bp)
1318 ASSERT(SEMA_HELD(&bp->b_sem));
1319 if (bp->b_flags & B_REMAPPED)
1320 bp_mapout(bp);
1321 VN_RELE(bp->b_vp);
1322 bp->b_vp = NULL;
1323 ASSERT((bp->b_flags & B_NOCACHE) != 0);
1325 /* A sema_v(bp->b_sem) is implied if we are destroying it */
1326 sema_destroy(&bp->b_sem);
1327 sema_destroy(&bp->b_io);
1328 kmem_free(bp, sizeof (struct buf));
1332 * Check to see whether the buffers, except the one pointed to by sbp,
1333 * associated with the device are busy.
1334 * NOTE: This expensive operation shall be improved together with ufs_icheck().
1337 bcheck(dev_t dev, struct buf *sbp)
1339 struct buf *bp;
1340 struct buf *dp;
1341 int i;
1342 kmutex_t *hmp;
1345 * check for busy bufs for this filesystem
1347 for (i = 0; i < v.v_hbuf; i++) {
1348 dp = (struct buf *)&hbuf[i];
1349 hmp = &hbuf[i].b_lock;
1351 mutex_enter(hmp);
1352 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1354 * if buf is busy or dirty, then filesystem is busy
1356 if ((bp->b_edev == dev) &&
1357 ((bp->b_flags & B_STALE) == 0) &&
1358 (bp->b_flags & (B_DELWRI|B_BUSY)) &&
1359 (bp != sbp)) {
1360 mutex_exit(hmp);
1361 return (1);
1364 mutex_exit(hmp);
1366 return (0);
1370 * Hash two 32 bit entities.
1373 hash2ints(int x, int y)
1375 int hash = 0;
1377 hash = x - 1;
1378 hash = ((hash * 7) + (x >> 8)) - 1;
1379 hash = ((hash * 7) + (x >> 16)) - 1;
1380 hash = ((hash * 7) + (x >> 24)) - 1;
1381 hash = ((hash * 7) + y) - 1;
1382 hash = ((hash * 7) + (y >> 8)) - 1;
1383 hash = ((hash * 7) + (y >> 16)) - 1;
1384 hash = ((hash * 7) + (y >> 24)) - 1;
1386 return (hash);
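/*
 * Example sketch of how hash2ints() is used: the bio_bhash() macro near
 * the top of this file masks the hash with v.v_hmask to pick a bucket
 * in hbuf[] (and the matching dwbuf[] bucket). The function name is
 * hypothetical.
 */
static struct hbuf *
example_bucket_for(dev_t dev, daddr_t blkno)
{
	uint_t index = bio_bhash(dev, blkno);	/* hash2ints(...) & v.v_hmask */

	return (&hbuf[index]);
}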
1391 * Return a new buffer struct.
1392 * Create a new buffer if we haven't gone over our high water
1393 * mark for memory, otherwise try to get one off the freelist.
1395 * Returns a locked buf that has no id and is not on any hash or free
1396 * list.
1398 static struct buf *
1399 bio_getfreeblk(long bsize)
1401 struct buf *bp, *dp;
1402 struct hbuf *hp;
1403 kmutex_t *hmp;
1404 uint_t start, end;
1407 * bfreelist.b_bufsize represents the amount of memory we are
1408 * allowed to allocate in the cache before we hit our hwm; references
1409 * to bfreelist are protected by bfree_lock
1410 * (mutex_enter(&bfree_lock) ... mutex_exit(&bfree_lock)).
1412 bio_mem_get(bsize); /* Account for our memory request */
1414 again:
1415 bp = bio_bhdr_alloc(); /* Get a buf hdr */
1416 sema_p(&bp->b_sem); /* Should never fail */
1418 ASSERT(bp->b_un.b_addr == NULL);
1419 bp->b_un.b_addr = kmem_alloc(bsize, KM_NOSLEEP);
1420 if (bp->b_un.b_addr != NULL) {
1422 * Make the common path short
1424 bp->b_bufsize = bsize;
1425 ASSERT(SEMA_HELD(&bp->b_sem));
1426 return (bp);
1427 } else {
1428 struct buf *save;
1430 save = bp; /* Save bp we allocated */
1431 start = end = lastindex;
1433 biostats.bio_bufwant.value.ui32++;
1436 * Memory isn't available from the system now. Scan
1437 * the hash buckets till enough space is found.
1439 do {
1440 hp = &hbuf[start];
1441 hmp = &hp->b_lock;
1442 dp = (struct buf *)hp;
1444 mutex_enter(hmp);
1445 bp = dp->av_forw;
1447 while (bp != dp) {
1449 ASSERT(bp != NULL);
1451 if (!sema_tryp(&bp->b_sem)) {
1452 bp = bp->av_forw;
1453 continue;
1457 * Since we are going down the freelist
1458 * associated with this hash bucket the
1459 * B_DELWRI flag should not be set.
1461 ASSERT(!(bp->b_flags & B_DELWRI));
1463 if (bp->b_bufsize == bsize) {
1464 hp->b_length--;
1465 notavail(bp);
1466 bremhash(bp);
1467 mutex_exit(hmp);
1470 * Didn't kmem_alloc any more, so don't
1471 * count it twice.
1473 mutex_enter(&bfree_lock);
1474 bfreelist.b_bufsize += bsize;
1475 mutex_exit(&bfree_lock);
1478 * Update the lastindex value.
1480 lastindex = start;
1483 * Put our saved bp back on the list
1485 sema_v(&save->b_sem);
1486 bio_bhdr_free(save);
1487 ASSERT(SEMA_HELD(&bp->b_sem));
1488 return (bp);
1490 sema_v(&bp->b_sem);
1491 bp = bp->av_forw;
1493 mutex_exit(hmp);
1494 start = ((start + 1) % v.v_hbuf);
1495 } while (start != end);
1497 biostats.bio_bufwait.value.ui32++;
1498 bp = save; /* Use original bp */
1499 bp->b_un.b_addr = kmem_alloc(bsize, KM_SLEEP);
1502 bp->b_bufsize = bsize;
1503 ASSERT(SEMA_HELD(&bp->b_sem));
1504 return (bp);
1508 * Allocate a buffer header. If none currently available, allocate
1509 * a new pool.
1511 static struct buf *
1512 bio_bhdr_alloc(void)
1514 struct buf *dp, *sdp;
1515 struct buf *bp;
1516 int i;
1518 for (;;) {
1519 mutex_enter(&bhdr_lock);
1520 if (bhdrlist != NULL) {
1521 bp = bhdrlist;
1522 bhdrlist = bp->av_forw;
1523 mutex_exit(&bhdr_lock);
1524 bp->av_forw = NULL;
1525 return (bp);
1527 mutex_exit(&bhdr_lock);
1530 * Need to allocate a new pool. If the system is currently
1531 * out of memory, then try freeing things on the freelist.
1533 dp = kmem_zalloc(sizeof (struct buf) * v.v_buf, KM_NOSLEEP);
1534 if (dp == NULL) {
1536 * System can't give us a pool of headers, try
1537 * recycling from the free lists.
1539 bio_recycle(BIO_HEADER, 0);
1540 } else {
1541 sdp = dp;
1542 for (i = 0; i < v.v_buf; i++, dp++) {
1544 * The next two lines are needed since NODEV
1545 * is -1 and not NULL
1547 dp->b_dev = (o_dev_t)NODEV;
1548 dp->b_edev = NODEV;
1549 dp->av_forw = dp + 1;
1550 sema_init(&dp->b_sem, 1, NULL, SEMA_DEFAULT,
1551 NULL);
1552 sema_init(&dp->b_io, 0, NULL, SEMA_DEFAULT,
1553 NULL);
1554 dp->b_offset = -1;
1556 mutex_enter(&bhdr_lock);
1557 (--dp)->av_forw = bhdrlist; /* Fix last pointer */
1558 bhdrlist = sdp;
1559 nbuf += v.v_buf;
1560 bp = bhdrlist;
1561 bhdrlist = bp->av_forw;
1562 mutex_exit(&bhdr_lock);
1564 bp->av_forw = NULL;
1565 return (bp);
1570 static void
1571 bio_bhdr_free(struct buf *bp)
1573 ASSERT(bp->b_back == NULL);
1574 ASSERT(bp->b_forw == NULL);
1575 ASSERT(bp->av_back == NULL);
1576 ASSERT(bp->av_forw == NULL);
1577 ASSERT(bp->b_un.b_addr == NULL);
1578 ASSERT(bp->b_dev == (o_dev_t)NODEV);
1579 ASSERT(bp->b_edev == NODEV);
1580 ASSERT(bp->b_flags == 0);
1582 mutex_enter(&bhdr_lock);
1583 bp->av_forw = bhdrlist;
1584 bhdrlist = bp;
1585 mutex_exit(&bhdr_lock);
1589 * If we haven't gone over the high water mark, it's o.k. to
1590 * allocate more buffer space, otherwise recycle buffers
1591 * from the freelist until enough memory is free for a bsize request.
1593 * We account for this memory, even though
1594 * we don't allocate it here.
1596 static void
1597 bio_mem_get(long bsize)
1599 mutex_enter(&bfree_lock);
1600 if (bfreelist.b_bufsize > bsize) {
1601 bfreelist.b_bufsize -= bsize;
1602 mutex_exit(&bfree_lock);
1603 return;
1605 mutex_exit(&bfree_lock);
1606 bio_recycle(BIO_MEM, bsize);
1610 * flush a list of delayed write buffers.
1611 * (currently used only by bio_recycle below.)
1613 static void
1614 bio_flushlist(struct buf *delwri_list)
1616 struct buf *bp;
1618 while (delwri_list != EMPTY_LIST) {
1619 bp = delwri_list;
1620 bp->b_flags |= B_AGE | B_ASYNC;
1621 if (bp->b_vp == NULL) { /* !ufs */
1622 bwrite(bp);
1623 } else { /* ufs */
1624 UFS_BWRITE(VTOI(bp->b_vp)->i_ufsvfs, bp);
1626 delwri_list = bp->b_list;
1627 bp->b_list = NULL;
1632 * Start recycling buffers on the freelist for one of 2 reasons:
1633 * - we need a buffer header
1634 * - we need to free up memory
1635 * Once started we continue to recycle buffers until the B_AGE
1636 * buffers are gone.
1638 static void
1639 bio_recycle(int want, long bsize)
1641 struct buf *bp, *dp, *dwp, *nbp;
1642 struct hbuf *hp;
1643 int found = 0;
1644 kmutex_t *hmp;
1645 int start, end;
1646 struct buf *delwri_list = EMPTY_LIST;
1649 * Recycle buffers.
1651 top:
1652 start = end = lastindex;
1653 do {
1654 hp = &hbuf[start];
1655 hmp = &hp->b_lock;
1656 dp = (struct buf *)hp;
1658 mutex_enter(hmp);
1659 bp = dp->av_forw;
1661 while (bp != dp) {
1663 ASSERT(bp != NULL);
1665 if (!sema_tryp(&bp->b_sem)) {
1666 bp = bp->av_forw;
1667 continue;
1670 * Do we really want to nuke all of the B_AGE stuff??
1672 if ((bp->b_flags & B_AGE) == 0 && found) {
1673 sema_v(&bp->b_sem);
1674 mutex_exit(hmp);
1675 lastindex = start;
1676 return; /* All done */
1679 ASSERT(MUTEX_HELD(&hp->b_lock));
1680 ASSERT(!(bp->b_flags & B_DELWRI));
1681 hp->b_length--;
1682 notavail(bp);
1685 * Remove bhdr from cache, free up memory,
1686 * and add the hdr to the freelist.
1688 bremhash(bp);
1689 mutex_exit(hmp);
1691 if (bp->b_bufsize) {
1692 kmem_free(bp->b_un.b_addr, bp->b_bufsize);
1693 bp->b_un.b_addr = NULL;
1694 mutex_enter(&bfree_lock);
1695 bfreelist.b_bufsize += bp->b_bufsize;
1696 mutex_exit(&bfree_lock);
1699 bp->b_dev = (o_dev_t)NODEV;
1700 bp->b_edev = NODEV;
1701 bp->b_flags = 0;
1702 sema_v(&bp->b_sem);
1703 bio_bhdr_free(bp);
1704 if (want == BIO_HEADER) {
1705 found = 1;
1706 } else {
1707 ASSERT(want == BIO_MEM);
1708 if (!found && bfreelist.b_bufsize >= bsize) {
1709 /* Account for the memory we want */
1710 mutex_enter(&bfree_lock);
1711 if (bfreelist.b_bufsize >= bsize) {
1712 bfreelist.b_bufsize -= bsize;
1713 found = 1;
1715 mutex_exit(&bfree_lock);
1720 * Since we dropped hmp, start from the
1721 * beginning.
1723 mutex_enter(hmp);
1724 bp = dp->av_forw;
1726 mutex_exit(hmp);
1729 * Look at the delayed write list.
1730 * First gather into a private list, then write them.
1732 dwp = (struct buf *)&dwbuf[start];
1733 mutex_enter(&blist_lock);
1734 bio_doingflush++;
1735 mutex_enter(hmp);
1736 for (bp = dwp->av_forw; bp != dwp; bp = nbp) {
1738 ASSERT(bp != NULL);
1739 nbp = bp->av_forw;
1741 if (!sema_tryp(&bp->b_sem))
1742 continue;
1743 ASSERT(bp->b_flags & B_DELWRI);
1745 * Do we really want to nuke all of the B_AGE stuff??
1748 if ((bp->b_flags & B_AGE) == 0 && found) {
1749 sema_v(&bp->b_sem);
1750 mutex_exit(hmp);
1751 lastindex = start;
1752 mutex_exit(&blist_lock);
1753 bio_flushlist(delwri_list);
1754 mutex_enter(&blist_lock);
1755 bio_doingflush--;
1756 if (bio_flinv_cv_wanted) {
1757 bio_flinv_cv_wanted = 0;
1758 cv_broadcast(&bio_flushinval_cv);
1760 mutex_exit(&blist_lock);
1761 return; /* All done */
1765 * If the buffer is already on a flush or
1766 * invalidate list then just skip it.
1768 if (bp->b_list != NULL) {
1769 sema_v(&bp->b_sem);
1770 continue;
1773 * We are still on the same bucket.
1775 hp->b_length--;
1776 notavail(bp);
1777 bp->b_list = delwri_list;
1778 delwri_list = bp;
1780 mutex_exit(hmp);
1781 mutex_exit(&blist_lock);
1782 bio_flushlist(delwri_list);
1783 delwri_list = EMPTY_LIST;
1784 mutex_enter(&blist_lock);
1785 bio_doingflush--;
1786 if (bio_flinv_cv_wanted) {
1787 bio_flinv_cv_wanted = 0;
1788 cv_broadcast(&bio_flushinval_cv);
1790 mutex_exit(&blist_lock);
1791 start = (start + 1) % v.v_hbuf;
1793 } while (start != end);
1795 if (found)
1796 return;
1799 * Free lists exhausted and we haven't satisfied the request.
1800 * Wait here for more entries to be added to freelist.
1801 * Because this might have just happened, make it timed.
1803 mutex_enter(&bfree_lock);
1804 bfreelist.b_flags |= B_WANTED;
1805 (void) cv_reltimedwait(&bio_mem_cv, &bfree_lock, hz, TR_CLOCK_TICK);
1806 mutex_exit(&bfree_lock);
1807 goto top;
1811 * See if the block is associated with some buffer
1812 * (mainly to avoid getting hung up on a wait in breada).
1814 static int
1815 bio_incore(dev_t dev, daddr_t blkno)
1817 struct buf *bp;
1818 struct buf *dp;
1819 uint_t index;
1820 kmutex_t *hmp;
1822 index = bio_bhash(dev, blkno);
1823 dp = (struct buf *)&hbuf[index];
1824 hmp = &hbuf[index].b_lock;
1826 mutex_enter(hmp);
1827 for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
1828 if (bp->b_blkno == blkno && bp->b_edev == dev &&
1829 (bp->b_flags & B_STALE) == 0) {
1830 mutex_exit(hmp);
1831 return (1);
1834 mutex_exit(hmp);
1835 return (0);
1838 static void
1839 bio_pageio_done(struct buf *bp)
1841 if (bp->b_flags & B_PAGEIO) {
1843 if (bp->b_flags & B_REMAPPED)
1844 bp_mapout(bp);
1846 if (bp->b_flags & B_READ)
1847 pvn_read_done(bp->b_pages, bp->b_flags);
1848 else
1849 pvn_write_done(bp->b_pages, B_WRITE | bp->b_flags);
1850 pageio_done(bp);
1851 } else {
1852 ASSERT(bp->b_flags & B_REMAPPED);
1853 bp_mapout(bp);
1854 brelse(bp);
1859 * bioerror(9F) - indicate error in buffer header
1860 * If 'error' is zero, remove the error indication.
1862 void
1863 bioerror(struct buf *bp, int error)
1865 ASSERT(bp != NULL);
1866 ASSERT(error >= 0);
1867 ASSERT(SEMA_HELD(&bp->b_sem));
1869 if (error != 0) {
1870 bp->b_flags |= B_ERROR;
1871 } else {
1872 bp->b_flags &= ~B_ERROR;
1874 bp->b_error = error;
1878 * bioreset(9F) - reuse a private buffer header after I/O is complete
1880 void
1881 bioreset(struct buf *bp)
1883 ASSERT(bp != NULL);
1885 biofini(bp);
1886 bioinit(bp);
1890 * biosize(9F) - return size of a buffer header
1892 size_t
1893 biosize(void)
1895 return (sizeof (struct buf));
1899 * biomodified(9F) - check if buffer is modified
1902 biomodified(struct buf *bp)
1904 int npf;
1905 int ppattr;
1906 struct page *pp;
1908 ASSERT(bp != NULL);
1910 if ((bp->b_flags & B_PAGEIO) == 0) {
1911 return (-1);
1913 pp = bp->b_pages;
1914 npf = btopr(bp->b_bcount + ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET));
1916 while (npf > 0) {
1917 ppattr = hat_pagesync(pp, HAT_SYNC_DONTZERO |
1918 HAT_SYNC_STOPON_MOD);
1919 if (ppattr & P_MOD)
1920 return (1);
1921 pp = pp->p_next;
1922 npf--;
1925 return (0);
1929 * bioinit(9F) - initialize a buffer structure
1931 void
1932 bioinit(struct buf *bp)
1934 bzero(bp, sizeof (struct buf));
1935 sema_init(&bp->b_sem, 0, NULL, SEMA_DEFAULT, NULL);
1936 sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
1937 bp->b_offset = -1;
1941 * biofini(9F) - uninitialize a buffer structure
1943 void
1944 biofini(struct buf *bp)
1946 sema_destroy(&bp->b_io);
1947 sema_destroy(&bp->b_sem);
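/*
 * Example sketch of the bioinit()/biofini() lifecycle for a
 * caller-owned buf; biosize() supplies the allocation size. The
 * example_* names are hypothetical.
 */
static struct buf *
example_alloc_private_buf(void)
{
	struct buf *bp = kmem_alloc(biosize(), KM_SLEEP);

	bioinit(bp);		/* b_sem/b_io initialized, b_offset = -1 */
	return (bp);
}

static void
example_free_private_buf(struct buf *bp)
{
	biofini(bp);		/* destroys b_sem and b_io */
	kmem_free(bp, biosize());
}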
1951 * bioclone(9F) - clone a buffer
1953 struct buf *
1954 bioclone(struct buf *bp, off_t off, size_t len, dev_t dev, daddr_t blkno,
1955 int (*iodone)(struct buf *), struct buf *bp_mem, int sleep)
1957 struct buf *bufp;
1959 ASSERT(bp);
1960 if (bp_mem == NULL) {
1961 bufp = kmem_alloc(sizeof (struct buf), sleep);
1962 if (bufp == NULL) {
1963 return (NULL);
1965 bioinit(bufp);
1966 } else {
1967 bufp = bp_mem;
1968 bioreset(bufp);
1971 #define BUF_CLONE_FLAGS (B_READ|B_WRITE|B_SHADOW|B_PHYS|B_PAGEIO|B_FAILFAST|\
1972 B_ABRWRITE)
1975 * The cloned buffer does not inherit the B_REMAPPED flag.
1977 bufp->b_flags = (bp->b_flags & BUF_CLONE_FLAGS) | B_BUSY;
1978 bufp->b_bcount = len;
1979 bufp->b_blkno = blkno;
1980 bufp->b_iodone = iodone;
1981 bufp->b_proc = bp->b_proc;
1982 bufp->b_edev = dev;
1983 bufp->b_file = bp->b_file;
1984 bufp->b_offset = bp->b_offset;
1986 if (bp->b_flags & B_SHADOW) {
1987 ASSERT(bp->b_shadow);
1988 ASSERT(bp->b_flags & B_PHYS);
1990 bufp->b_shadow = bp->b_shadow +
1991 btop(((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off);
1992 bufp->b_un.b_addr = (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
1993 if (bp->b_flags & B_REMAPPED)
1994 bufp->b_proc = NULL;
1995 } else {
1996 if (bp->b_flags & B_PAGEIO) {
1997 struct page *pp;
1998 off_t o;
1999 int i;
2001 pp = bp->b_pages;
2002 o = ((uintptr_t)bp->b_un.b_addr & PAGEOFFSET) + off;
2003 for (i = btop(o); i > 0; i--) {
2004 pp = pp->p_next;
2006 bufp->b_pages = pp;
2007 bufp->b_un.b_addr = (caddr_t)(o & PAGEOFFSET);
2008 } else {
2009 bufp->b_un.b_addr =
2010 (caddr_t)((uintptr_t)bp->b_un.b_addr + off);
2011 if (bp->b_flags & B_REMAPPED)
2012 bufp->b_proc = NULL;
2015 return (bufp);
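/*
 * Example sketch of bioclone(): issue part of a parent request as a
 * separate child buf against a (possibly different) device and block,
 * wait for it, then free the clone. The function name, the zero offset
 * and the synchronous style are hypothetical.
 */
static int
example_clone_and_wait(struct buf *pbp, size_t len, dev_t dev, daddr_t blkno)
{
	struct buf *cbp;
	int error;

	cbp = bioclone(pbp, 0, len, dev, blkno, NULL, NULL, KM_SLEEP);
	(void) bdev_strategy(cbp);
	error = biowait(cbp);
	biofini(cbp);
	kmem_free(cbp, biosize());
	return (error);
}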