4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 * The Regents of the University of California
34 * University Acknowledgment- Portions of this document are derived from
35 * software developed by the University of California, Berkeley, and its
42 #include <sys/types32.h>
43 #include <sys/t_lock.h>
44 #include <sys/kstat.h>
45 #include <sys/stdbool.h>
52 * Each buffer in the pool is usually doubly linked into 2 lists:
53 * the device with which it is currently associated (always)
54 * and also on a list of blocks available for allocation
55 * for other use (usually).
56 * The latter list is kept in last-used order, and the two
57 * lists are doubly linked to make it easy to remove
58 * a buffer from one list when it was found by
59 * looking through the other.
60 * A buffer is on the available list, and is liable
61 * to be reassigned to another disk block, if and only
62 * if it is not marked BUSY. When a buffer is busy, the
63 * available-list pointers can be used for other purposes.
64 * Most drivers use the forward ptr as a link in their I/O active queue.
65 * A buffer header contains all the information required to perform I/O.
66 * Most of the routines which manipulate these things are in bio.c.
68 * There are a number of locks associated with the buffer management
70 * hbuf.b_lock: protects hash chains, buffer hdr freelists
71 * and delayed write freelist
72 * bfree_lock: protects the bfreelist structure
73 * bhdr_lock: protects the free header list
74 * blist_lock: protects b_list fields
75 * buf.b_sem: protects all remaining members in the buf struct
76 * buf.b_io: I/O synchronization variable
78 * A buffer header is never "locked" (b_sem) when it is on
79 * a "freelist" (bhdrlist or bfreelist avail lists).
82 int b_flags
; /* see defines below */
83 struct buf
*b_forw
; /* headed by d_tab of conf.c */
84 struct buf
*b_back
; /* " */
85 struct buf
*av_forw
; /* position on free list, */
86 struct buf
*av_back
; /* if not BUSY */
87 o_dev_t b_dev
; /* OLD major+minor device name */
88 size_t b_bcount
; /* transfer count */
90 caddr_t b_addr
; /* low order core address */
91 struct fs
*b_fs
; /* superblocks */
92 struct cg
*b_cg
; /* UFS cylinder group block */
93 struct dinode
*b_dino
; /* UFS ilist */
94 daddr32_t
*b_daddr
; /* disk blocks */
97 lldaddr_t _b_blkno
; /* block # on device (union) */
98 #define b_lblkno _b_blkno._f
100 #define b_blkno _b_blkno._f
102 #define b_blkno _b_blkno._p._l
105 char b_obs1
; /* obsolete */
106 size_t b_resid
; /* words not transferred after error */
107 clock_t b_start
; /* request start time */
108 struct proc
*b_proc
; /* process doing physical or swap I/O */
109 struct page
*b_pages
; /* page list for PAGEIO */
110 clock_t b_obs2
; /* obsolete */
111 /* Begin new stuff */
112 #define b_actf av_forw
113 #define b_actl av_back
114 #define b_active b_bcount
115 #define b_errcnt b_resid
116 size_t b_bufsize
; /* size of allocated buffer */
117 int (*b_iodone
)(struct buf
*); /* function called by iodone */
118 struct vnode
*b_vp
; /* vnode associated with block */
119 struct buf
*b_chain
; /* chain together all buffers here */
120 int b_obs3
; /* obsolete */
121 int b_error
; /* expanded error field */
122 void *b_private
; /* "opaque" driver private area */
123 dev_t b_edev
; /* expanded dev field */
124 ksema_t b_sem
; /* Exclusive access to buf */
125 ksema_t b_io
; /* I/O Synchronization */
126 struct buf
*b_list
; /* List of potential B_DELWRI bufs */
127 struct page
**b_shadow
; /* shadow page list */
128 void *b_dip
; /* device info pointer */
129 struct vnode
*b_file
; /* file associated with this buffer */
130 offset_t b_offset
; /* offset in file assoc. with buffer */
134 * Bufhd structures used at the head of the hashed buffer queues.
135 * We only need seven words for this, so this abbreviated
136 * definition saves some space.
139 int b_flags
; /* not used, needed for consistency */
140 struct buf
*b_forw
, *b_back
; /* queue of unit queues */
141 struct buf
*av_forw
, *av_back
; /* queue of bufs for this unit */
142 o_dev_t b_dev
; /* OLD major+minor device name */
143 size_t b_bcount
; /* transfer count */
148 * Statistics on the buffer cache
151 kstat_named_t bio_lookup
; /* requests to assign buffer */
152 kstat_named_t bio_hit
; /* buffer already associated with blk */
153 kstat_named_t bio_bufwant
; /* kmem_allocs NOSLEEP failed new buf */
154 kstat_named_t bio_bufwait
; /* kmem_allocs with KM_SLEEP for buf */
155 kstat_named_t bio_bufbusy
; /* buffer locked by someone else */
156 kstat_named_t bio_bufdup
; /* duplicate buffer found for block */
160 * These flags are kept in b_flags.
161 * The first group is part of the DDI
/* Flags that are part of the DDI. */
#define	B_BUSY		0x00000001	/* not on av_forw/back list */
#define	B_DONE		0x00000002	/* transaction finished */
#define	B_ERROR		0x00000004	/* transaction aborted */
#define	B_PAGEIO	0x00000010	/* do I/O to pages on bp->b_pages */
#define	B_PHYS		0x00000020	/* Physical IO potentially using UNIBUS map */
#define	B_READ		0x00000040	/* read when I/O occurs */
#define	B_WRITE		0x00000100	/* non-read pseudo-flag */

/* Not part of the DDI */
#define	B_WANTED	0x00000080	/* issue wakeup when BUSY goes off */
#define	B_AGE		0x00000200	/* delayed write for correct aging */
#define	B_ASYNC		0x00000400	/* don't wait for I/O completion */
#define	B_DELWRI	0x00000800	/* delayed write-wait til buf needed */
#define	B_STALE		0x00001000	/* on av_* list; invalid contents */
#define	B_DONTNEED	0x00002000	/* after write, need not be cached */
#define	B_REMAPPED	0x00004000	/* buffer is kernel addressable */
#define	B_FREE		0x00008000	/* free page when done */
#define	B_INVAL		0x00010000	/* destroy page when done */
#define	B_FORCE		0x00020000	/* semi-permanent removal from cache */
#define	B_NOCACHE	0x00080000	/* don't cache block when released */
#define	B_TRUNC		0x00100000	/* truncate page without I/O */
#define	B_SHADOW	0x00200000	/* is b_shadow field valid? */
#define	B_RETRYWRI	0x00400000	/* retry write til works or bfinval */
#define	B_FAILFAST	0x01000000	/* Fail promptly if device goes away */
#define	B_STARTED	0x02000000	/* io:::start probe called for buf */
#define	B_ABRWRITE	0x04000000	/* Application based recovery active */
#define	B_PAGE_NOWAIT	0x08000000	/* Skip the page if it is locked */
192 * There is some confusion over the meaning of B_FREE and B_INVAL and what
193 * the use of one over the other implies.
195 * In both cases, when we are done with the page (buffer) we want to free
196 * up the page. In the case of B_FREE, the page will go to the cachelist.
197 * In the case of B_INVAL, the page will be destroyed (hashed out of its
198 * vnode) and placed on the freelist. Beyond this, there is no difference
199 * between the sole use of these two flags. In both cases, IO will be done
200 * if the page is not yet committed to storage.
202 * In order to discard pages without writing them back, use (B_INVAL | B_TRUNC).
205 * Use (B_INVAL | B_FORCE) to force the page to be destroyed even if we
206 * could not successfully write out the page.
210 * Insq/Remq for the buffer hash lists.
/*
 * bremhash(bp): unlink bp from the doubly linked hash chain it is on and
 * clear both of its hash links.  bp must currently be linked (neither
 * link may be NULL).
 *
 * The body is wrapped in do/while (0) so the expansion is exactly one
 * statement and stays correct under an unbraced if/else; the caller
 * supplies the terminating semicolon.  bp is evaluated several times, so
 * it must not have side effects.
 */
#define	bremhash(bp) \
	do { \
		ASSERT((bp)->b_forw != NULL); \
		ASSERT((bp)->b_back != NULL); \
		(bp)->b_back->b_forw = (bp)->b_forw; \
		(bp)->b_forw->b_back = (bp)->b_back; \
		(bp)->b_forw = (bp)->b_back = NULL; \
	} while (0)
/*
 * binshash(bp, dp): link bp into the doubly linked hash chain headed by
 * dp, placing it immediately after dp.  bp must not be linked anywhere
 * (both links NULL) and dp must be an initialized list head (both links
 * non-NULL).
 *
 * The body is wrapped in do/while (0) so the expansion is exactly one
 * statement and stays correct under an unbraced if/else; the caller
 * supplies the terminating semicolon.  Both arguments are evaluated
 * several times, so they must not have side effects.
 */
#define	binshash(bp, dp) \
	do { \
		ASSERT((bp)->b_forw == NULL); \
		ASSERT((bp)->b_back == NULL); \
		ASSERT((dp)->b_forw != NULL); \
		ASSERT((dp)->b_back != NULL); \
		(bp)->b_forw = (dp)->b_forw; \
		(bp)->b_back = (dp); \
		(dp)->b_forw->b_back = (bp); \
		(dp)->b_forw = (bp); \
	} while (0)
234 * The hash structure maintains two lists:
236 * 1) The hash list of buffers (b_forw & b_back)
237 * 2) The LRU free list of buffers on this hash bucket (av_forw & av_back)
239 * The dwbuf structure keeps a list of delayed write buffers per hash bucket
240 * hence there are exactly the same number of dwbuf structures as there are
241 * the hash buckets (hbuf structures) in the system.
243 * The number of buffers on the freelist may not be equal to the number of
244 * buffers on the hash list. That is because when buffers are busy they are
245 * taken off the freelist but not off the hash list. "b_length" field keeps
246 * track of the number of free buffers (including delayed writes ones) on
247 * the hash bucket. The "b_lock" mutex protects the free list as well as
248 * the hash list. It also protects the counter "b_length".
250 * Entries b_forw, b_back, av_forw & av_back must be at the same offset
251 * as the ones in buf structure.
256 struct buf
*b_forw
; /* hash list forw pointer */
257 struct buf
*b_back
; /* hash list back pointer */
259 struct buf
*av_forw
; /* free list forw pointer */
260 struct buf
*av_back
; /* free list back pointer */
262 int b_length
; /* # of entries on free list */
263 kmutex_t b_lock
; /* lock to protect this structure */
268 * The delayed list pointer entries should match with the buf structure.
/*
 * Delayed-write list head members.  Only the av_* links are used; the
 * members ahead of them repeat the leading members of the buffer header.
 */
int		b_flags;	/* not used */
struct buf	*b_forw;	/* not used */
struct buf	*b_back;	/* not used */
struct buf	*av_forw;	/* delayed write forw pointer */
struct buf	*av_back;	/* delayed write back pointer */
/*
 * notavail(bp): unlink a buffer from the available (free or delayed
 * write) list and mark it busy (internal interface).
 *
 * The caller must hold bp's b_sem, and bp must be linked on an available
 * list whose neighbours are other nodes (its av_* links are neither NULL
 * nor pointing back at bp itself).  On completion B_BUSY is set in
 * b_flags and both av_* links are NULL.
 *
 * Every use of the argument is parenthesized so that an expression such
 * as "p + 1" expands correctly, and the body is wrapped in do/while (0)
 * so the expansion is a single statement; the caller supplies the
 * terminating semicolon.  bp is evaluated several times, so it must not
 * have side effects.
 */
#define	notavail(bp) \
	do { \
		ASSERT(SEMA_HELD(&(bp)->b_sem)); \
		ASSERT((bp)->av_forw != NULL); \
		ASSERT((bp)->av_back != NULL); \
		ASSERT((bp)->av_forw != (bp)); \
		ASSERT((bp)->av_back != (bp)); \
		(bp)->av_back->av_forw = (bp)->av_forw; \
		(bp)->av_forw->av_back = (bp)->av_back; \
		(bp)->b_flags |= B_BUSY; \
		(bp)->av_forw = (bp)->av_back = NULL; \
	} while (0)
299 extern struct hbuf
*hbuf
; /* Hash table */
300 extern struct dwbuf
*dwbuf
; /* delayed write hash table */
301 extern struct buf
*buf
; /* The buffer pool itself */
302 extern struct buf bfreelist
; /* head of available list */
304 extern void (*bio_lufs_strategy
)(void *, buf_t
*); /* UFS Logging */
305 extern void (*bio_snapshot_strategy
)(void *, buf_t
*); /* UFS snapshots */
307 int bcheck(dev_t
, struct buf
*);
308 int iowait(struct buf
*);
309 int hash2ints(int x
, int y
);
311 int biowait(struct buf
*);
312 int biomodified(struct buf
*);
313 int geterror(struct buf
*);
314 void minphys(struct buf
*);
316 * ufsvfsp is declared as a void * to avoid having everyone that uses
317 * this header file include sys/fs/ufs_inode.h.
319 void bwrite_common(void *ufsvfsp
, struct buf
*, bool force_wait
,
320 bool do_relse
, int clear_flags
);
321 void bdwrite(struct buf
*);
322 void bawrite(struct buf
*);
323 void brelse(struct buf
*);
324 void iodone(struct buf
*);
325 void clrbuf(struct buf
*);
327 void blkflush(dev_t
, daddr_t
);
329 int bfinval(dev_t
, int);
331 void biodone(struct buf
*);
332 void bioinit(struct buf
*);
333 void biofini(struct buf
*);
334 void bp_mapin(struct buf
*);
335 void *bp_mapin_common(struct buf
*, int);
336 void bp_mapout(struct buf
*);
337 int bp_copyin(struct buf
*, void *, offset_t
, size_t);
338 int bp_copyout(void *, struct buf
*, offset_t
, size_t);
339 void bp_init(size_t, uint_t
);
340 int bp_color(struct buf
*);
341 void pageio_done(struct buf
*);
342 struct buf
*bread_common(void *, dev_t
, daddr_t
, long);
343 struct buf
*breada(dev_t
, daddr_t
, daddr_t
, long);
344 struct buf
*getblk_common(void *, dev_t
, daddr_t
, long, bool);
345 struct buf
*ngeteblk(long);
346 struct buf
*geteblk(void);
347 struct buf
*pageio_setup(struct page
*, size_t, struct vnode
*, int);
348 void bioerror(struct buf
*bp
, int error
);
349 void bioreset(struct buf
*bp
);
350 struct buf
*bioclone(struct buf
*, off_t
, size_t, dev_t
, daddr_t
,
351 int (*)(struct buf
*), struct buf
*, int);
352 size_t biosize(void);
355 * B_RETRYWRI is not included in clear_flags for bwrite(), bwrite2(),
356 * or brwrite() so that the retry operation is persistent until the
357 * write either succeeds or the buffer is bfinval()'d.
360 /* Read in (if necessary) the block and return a buffer pointer. */
361 static inline struct buf
*bread(dev_t dev
, daddr_t blkno
, long bsize
)
363 return bread_common(NULL
, dev
, blkno
, bsize
);
367 * Write the buffer, waiting for completion (unless B_ASYNC is set).
368 * Then release the buffer.
370 static inline void bwrite(struct buf
*bp
)
372 bwrite_common(NULL
, bp
, false, true,
373 (B_READ
| B_DONE
| B_ERROR
| B_DELWRI
));
377 * Write the buffer, waiting for completion.
378 * But don't release the buffer afterwards.
380 static inline void bwrite2(struct buf
*bp
)
382 bwrite_common(NULL
, bp
, true, false,
383 (B_READ
| B_DONE
| B_ERROR
| B_DELWRI
));
387 * Assign a buffer for the given block. If the appropriate
388 * block is already associated, return it; otherwise search
389 * for the oldest non-busy buffer and reassign it.
391 static inline struct buf
*getblk(dev_t dev
, daddr_t blkno
, long bsize
)
393 return getblk_common(NULL
, dev
, blkno
, bsize
, false);
397 * Same as bdwrite() except write failures are retried.
399 static inline void bdrwrite(struct buf
*bp
)
401 bp
->b_flags
|= B_RETRYWRI
;
404 #endif /* defined(_KERNEL) */
410 #endif /* _SYS_BUF_H */