/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 * Modifications/enhancements:
 * 	Copyright (c) 1995 John S. Dyson.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_debug_cluster.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <sys/sysctl.h>
#if defined(CLUSTERDEBUG)
static int	rcluster = 0;
SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0,
    "Debug VFS clustering code");
#endif
static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer");

static struct cluster_save *
	cluster_collectbufs(struct vnode *vp, struct buf *last_bp);
static struct buf *
	cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
	    daddr_t blkno, long size, int run, struct buf *fbp);
static void cluster_callback(struct buf *);
static int write_behind = 1;
SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0,
    "Cluster write-behind; 0: disable, 1: enable, 2: backed off");
static int read_max = 8;
SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0,
    "Cluster read-ahead max block count");
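
/*
 * Illustrative usage (not part of this file): both knobs above are plain
 * run-time sysctls, so they can be inspected or tuned from userland, e.g.
 *
 *	sysctl vfs.write_behind=2	# switch to the backed-off mode
 *	sysctl vfs.read_max=32		# allow a larger read-ahead window
 *
 * The OID names follow directly from the SYSCTL_INT() declarations above;
 * the example values are arbitrary.
 */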
/* Page expended to mark partially backed buffers */
extern vm_page_t	bogus_page;
/*
 * Read data to a buf, including read-ahead if we find this to be beneficial.
 * cluster_read replaces bread.
 */
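/*
 * Illustrative only (not taken from this file): a filesystem read path is
 * expected to call this interface roughly as
 *
 *	error = cluster_read(vp, ip->i_size, lbn, size, NOCRED,
 *	    uio->uio_resid, seqcount, &bp);
 *
 * where ip, lbn, size, uio and seqcount are stand-ins for the caller's own
 * state.
 */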
int
cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
	struct buf *bp, *rbp, *reqbp;
	daddr_t blkno, origblkno;
	/*
	 * Try to limit the amount of read-ahead by a few
	 * ad-hoc parameters.  This needs work!!!
	 */
	racluster = vp->v_mount->mnt_iosize_max / size;
	maxra = seqcount;
	maxra = min(read_max, maxra);
	maxra = min(nbuf/8, maxra);
	if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize)
		maxra = (filesize / size) - lblkno;
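
	/*
	 * Worked example (illustrative values only): with
	 * mnt_iosize_max = 64kB and an 8kB block size, racluster is 8
	 * blocks per physical I/O.  The read-ahead window maxra is then
	 * clamped by read_max, by nbuf/8, and finally trimmed so that it
	 * never extends past the last block of the file.
	 */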
	/*
	 * get the requested block
	 */
	*bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, 0);
	/*
	 * if it is in the cache, then check to see if the reads have been
	 * sequential.  If they have, then try some read-ahead, otherwise
	 * back-off on prospective read-aheads.
	 */
	if (bp->b_flags & B_CACHE) {
	} else if ((bp->b_flags & B_RAM) == 0) {
		bp->b_flags &= ~B_RAM;
		for (i = 1; i < maxra; i++) {
			/*
			 * Stop if the buffer does not exist or it
			 * is invalid (about to go away?)
			 */
			rbp = gbincore(&vp->v_bufobj, lblkno+i);
			if (rbp == NULL || (rbp->b_flags & B_INVAL))
				break;

			/*
			 * Set another read-ahead mark so we know
			 * to check again. (If we can lock the
			 * buffer without waiting)
			 */
			if ((((i % racluster) == (racluster - 1)) ||
			    (i == (maxra - 1)))
			    && (0 == BUF_LOCK(rbp,
			    LK_EXCLUSIVE | LK_NOWAIT, NULL))) {
				rbp->b_flags |= B_RAM;
	/*
	 * If it isn't in the cache, then get a chunk from
	 * disk if sequential, otherwise just get the block.
	 */
		off_t firstread = bp->b_offset;

		KASSERT(bp->b_offset != NOOFFSET,
		    ("cluster_read: no buffer offset"));
		/*
		 * Compute the total number of blocks that we should read
		 */
		if (firstread + totread > filesize)
			totread = filesize - firstread;
		nblks = howmany(totread, size);
		if (nblks > racluster)
			nblks = racluster;
		/*
		 * Now compute the number of contiguous blocks.
		 */
		error = VOP_BMAP(vp, lblkno, NULL,
		    &blkno, &ncontig, NULL);
		/*
		 * If this failed to map just do the original block.
		 */
		if (error || blkno == -1)
			ncontig = 0;

	/*
	 * If we have contiguous data available do a cluster
	 * otherwise just read the requested block.
	 */
		/* Account for our first block. */
		ncontig = min(ncontig + 1, nblks);
		bp = cluster_rbuild(vp, filesize, lblkno,
		    blkno, size, nblks, bp);
		lblkno += (bp->b_bufsize / size);
		bp->b_flags |= B_RAM;
		bp->b_iocmd = BIO_READ;
	/*
	 * handle the synchronous read so that it is available ASAP.
	 */
	if ((bp->b_flags & B_CLUSTER) == 0) {
		vfs_busy_pages(bp, 0);
	}
	bp->b_flags &= ~B_INVAL;
	bp->b_ioflags &= ~BIO_ERROR;
	if ((bp->b_flags & B_ASYNC) || bp->b_iodone != NULL)
		BUF_KERNPROC(bp);
	bp->b_iooffset = dbtob(bp->b_blkno);
	curthread->td_ru.ru_inblock++;
	/*
	 * If we have been doing sequential I/O, then do some read-ahead.
	 */
	while (lblkno < (origblkno + maxra)) {
		error = VOP_BMAP(vp, lblkno, NULL, &blkno, &ncontig, NULL);
		/*
		 * We could throttle ncontig here by maxra but we might as
		 * well read the data if it is contiguous.  We're throttled
		 * by racluster anyway.
		 */
		ncontig = min(ncontig + 1, racluster);
		rbp = cluster_rbuild(vp, filesize, lblkno, blkno,
		    size, ncontig, NULL);
		lblkno += (rbp->b_bufsize / size);
		if (rbp->b_flags & B_DELWRI) {
			rbp = getblk(vp, lblkno, size, 0, 0, 0);
			if (rbp->b_flags & B_DELWRI) {
			rbp->b_flags |= B_ASYNC | B_RAM;
			rbp->b_iocmd = BIO_READ;
			rbp->b_blkno = blkno;
		if (rbp->b_flags & B_CACHE) {
			rbp->b_flags &= ~B_ASYNC;
		}
		if ((rbp->b_flags & B_CLUSTER) == 0) {
			vfs_busy_pages(rbp, 0);
		}
		rbp->b_flags &= ~B_INVAL;
		rbp->b_ioflags &= ~BIO_ERROR;
		if ((rbp->b_flags & B_ASYNC) || rbp->b_iodone != NULL)
			BUF_KERNPROC(rbp);
		rbp->b_iooffset = dbtob(rbp->b_blkno);
		curthread->td_ru.ru_inblock++;
	}

	return (bufwait(reqbp));
/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
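/*
 * Illustrative example (values invented for explanation): if the caller
 * asks for logical block 10 of a file with an 8kB block size and VOP_BMAP
 * reports five further contiguous blocks on disk, cluster_rbuild() builds
 * one pbuf describing a single 48kB physical read; cluster_callback()
 * later parcels the pages back out to the six component buffers.
 */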
static struct buf *
cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
	struct buf *bp, *tbp;
	KASSERT(size == vp->v_mount->mnt_stat.f_iosize,
	    ("cluster_rbuild: size %ld != filesize %jd\n",
	    size, (intmax_t)vp->v_mount->mnt_stat.f_iosize));
	while ((u_quad_t) size * (lbn + run) > filesize) {
		tbp->b_iocmd = BIO_READ;
	tbp = getblk(vp, lbn, size, 0, 0, 0);
	if (tbp->b_flags & B_CACHE)
		return tbp;
	tbp->b_flags |= B_ASYNC | B_RAM;
	tbp->b_iocmd = BIO_READ;
	tbp->b_blkno = blkno;
	if ((tbp->b_flags & B_MALLOC) ||
	    ((tbp->b_flags & B_VMIO) == 0) || (run <= 1))
		return tbp;

	bp = trypbuf(&cluster_pbuf_freecnt);
	/*
	 * We are synthesizing a buffer out of vm_page_t's, but
	 * if the block size is not page aligned then the starting
	 * address may not be either.  Inherit the b_data offset
	 * from the original buffer.
	 */
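	/*
	 * Illustrative arithmetic (not from the original comment): with a
	 * 6kB block size on 4kB pages, tbp->b_data may begin 2kB into its
	 * first page; OR-ing (tbp->b_data & PAGE_MASK) into the pbuf's
	 * b_data below reproduces that same in-page offset in the cluster
	 * buffer's mapping.
	 */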
	bp->b_data = (char *)((vm_offset_t)bp->b_data |
	    ((vm_offset_t)tbp->b_data & PAGE_MASK));
	bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO;
	bp->b_iocmd = BIO_READ;
	bp->b_iodone = cluster_callback;
	bp->b_offset = tbp->b_offset;
	KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset"));

	TAILQ_INIT(&bp->b_cluster.cluster_head);
	for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
			if ((bp->b_npages * PAGE_SIZE) +
			    round_page(size) > vp->v_mount->mnt_iosize_max) {
				break;
			}

			tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT);
			/* Don't wait around for locked bufs. */
			if (tbp == NULL)
				break;

			/*
			 * Stop scanning if the buffer is fully valid
			 * (marked B_CACHE), or locked (may be doing a
			 * background write), or if the buffer is not
			 * VMIO backed.  The clustering code can only deal
			 * with VMIO-backed buffers.
			 */
			if ((tbp->b_vflags & BV_BKGRDINPROG) ||
			    (tbp->b_flags & B_CACHE) ||
			    (tbp->b_flags & B_VMIO) == 0) {
			/*
			 * The buffer must be completely invalid in order to
			 * take part in the cluster.  If it is partially valid
			 * then we stop.
			 */
			VM_OBJECT_LOCK(tbp->b_bufobj->bo_object);
			for (j = 0; j < tbp->b_npages; j++) {
				VM_OBJECT_LOCK_ASSERT(tbp->b_pages[j]->object,
				    MA_OWNED);
				if (tbp->b_pages[j]->valid)
					break;
			}
			VM_OBJECT_UNLOCK(tbp->b_bufobj->bo_object);
			if (j != tbp->b_npages) {
			/*
			 * Set a read-ahead mark as appropriate
			 */
			if ((fbp && (i == 1)) || (i == (run - 1)))
				tbp->b_flags |= B_RAM;
			/*
			 * Set the buffer up for an async read (XXX should
			 * we do this only if we do not wind up brelse()ing?).
			 * Set the block number if it isn't set, otherwise
			 * if it is make sure it matches the block number we
			 * expect it to be.
			 */
			tbp->b_flags |= B_ASYNC;
			tbp->b_iocmd = BIO_READ;
			if (tbp->b_blkno == tbp->b_lblkno) {
				tbp->b_blkno = bn;
			} else if (tbp->b_blkno != bn) {
		/*
		 * XXX fbp from caller may not be B_ASYNC, but we are going
		 * to biodone() it in cluster_callback() anyway
		 */
		TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
		    tbp, b_cluster.cluster_entry);
		VM_OBJECT_LOCK(tbp->b_bufobj->bo_object);
		for (j = 0; j < tbp->b_npages; j += 1) {
			vm_object_pip_add(m->object, 1);
			if ((bp->b_npages == 0) ||
			    (bp->b_pages[bp->b_npages-1] != m)) {
				bp->b_pages[bp->b_npages] = m;
				bp->b_npages++;
			}
			if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
				tbp->b_pages[j] = bogus_page;
		}
		VM_OBJECT_UNLOCK(tbp->b_bufobj->bo_object);
		/*
		 * XXX shouldn't this be += size for both, like in
		 * cluster_wbuild()?
		 *
		 * Don't inherit tbp->b_bufsize as it may be larger due to
		 * a non-page-aligned size.  Instead just aggregate using
		 * 'size'.
		 */
		if (tbp->b_bcount != size)
			printf("warning: tbp->b_bcount wrong %ld vs %ld\n",
			    tbp->b_bcount, size);
		if (tbp->b_bufsize != size)
			printf("warning: tbp->b_bufsize wrong %ld vs %ld\n",
			    tbp->b_bufsize, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;
	/*
	 * Fully valid pages in the cluster are already good and do not need
	 * to be re-read from disk.  Replace the page with bogus_page
	 */
	VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
	for (j = 0; j < bp->b_npages; j++) {
		VM_OBJECT_LOCK_ASSERT(bp->b_pages[j]->object, MA_OWNED);
		if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) ==
		    VM_PAGE_BITS_ALL)
			bp->b_pages[j] = bogus_page;
	}
	VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
	if (bp->b_bufsize > bp->b_kvasize)
		panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
		    bp->b_bufsize, bp->b_kvasize);
	bp->b_kvasize = bp->b_bufsize;

	pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
	    (vm_page_t *)bp->b_pages, bp->b_npages);
/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 */
static void
cluster_callback(bp)
	struct buf *bp;
{
	struct buf *nbp, *tbp;
	/*
	 * Must propagate errors to all the components.
	 */
	if (bp->b_ioflags & BIO_ERROR)
		error = bp->b_error;

	pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.
	 */
	for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
	     tbp; tbp = nbp) {
		nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
		if (error) {
			tbp->b_ioflags |= BIO_ERROR;
			tbp->b_error = error;
		} else {
			tbp->b_dirtyoff = tbp->b_dirtyend = 0;
			tbp->b_flags &= ~B_INVAL;
			tbp->b_ioflags &= ~BIO_ERROR;
			/*
			 * XXX the bdwrite()/bqrelse() issued during
			 * cluster building clears B_RELBUF (see bqrelse()
			 * comment).  If direct I/O was specified, we have
			 * to restore it here to allow the buffer and VM
			 * to be freed.
			 */
			if (tbp->b_flags & B_DIRECT)
				tbp->b_flags |= B_RELBUF;
	relpbuf(bp, &cluster_pbuf_freecnt);
/*
 * Implement modified write build for cluster.
 *
 *	write_behind = 0	write behind disabled
 *	write_behind = 1	write behind normal (default)
 *	write_behind = 2	write behind backed off
 */
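/*
 * Illustrative behaviour (summarizing the modes above, not adding to them):
 * with vfs.write_behind=0 this function simply returns without writing, so
 * dirty clusters are left for the buf daemon; the default of 1 pushes each
 * completed cluster immediately via cluster_wbuild(); 2 selects the
 * backed-off variant through the same sysctl.
 */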
cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len)
	switch(write_behind) {
		r = cluster_wbuild(vp, size, start_lbn, len);
/*
 * Do clustered write for FFS.
 *
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
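/*
 * Worked example (illustrative numbers only): for a strictly sequential
 * writer on an 8kB-block filesystem with mnt_iosize_max = 64kB, maxclen is
 * 7, so case 2 begins a cluster at the first dirty block, case 3 simply
 * leaves the following blocks as delayed writes, and case 4 pushes the
 * accumulated eight blocks (64kB) to disk in one request through
 * cluster_wbuild_wb().
 */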
void
cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount)
{
	int maxclen, cursize;
	if (vp->v_type == VREG) {
		async = vp->v_mount->mnt_kern_flag & MNTK_ASYNC;
		lblocksize = vp->v_mount->mnt_stat.f_iosize;
	} else {
		async = 0;
		lblocksize = bp->b_bufsize;
	}
	KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset"));
	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
		maxclen = vp->v_mount->mnt_iosize_max / lblocksize - 1;
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its last
			 * write, or we have reached our maximum cluster size,
			 * then push the previous cluster.  Otherwise try
			 * reallocating to make it sequential.
			 *
			 * Change to algorithm: only push previous cluster if
			 * it was sequential from the point of view of the
			 * seqcount heuristic, otherwise leave the buffer
			 * intact so we can potentially optimize the I/O
			 * later on in the buf_daemon or update daemon
			 * flush.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
			if (((u_quad_t) bp->b_offset + lblocksize) != filesize ||
			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
				if (!async && seqcount > 0) {
					cluster_wbuild_wb(vp, lblocksize,
					    vp->v_cstart, cursize);
				}
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster
					 * if *really* writing sequentially
					 * in the logical file (seqcount > 1),
					 * otherwise delay it in the hopes that
					 * the low level disk driver can
					 * optimize the write ordering.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					if (seqcount > 1) {
						cluster_wbuild_wb(vp,
						    lblocksize, vp->v_cstart,
						    cursize);
					}
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vp->v_lasta = bp->b_blkno;
		/*
		 * Consider beginning a cluster.  If at end of file, make
		 * cluster as large as possible, otherwise find size of
		 * existing cluster.
		 */
		if ((vp->v_type == VREG) &&
		    ((u_quad_t) bp->b_offset + lblocksize) != filesize &&
		    (bp->b_blkno == bp->b_lblkno) &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
		     bp->b_blkno == -1)) {
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
		vp->v_clen = maxclen;
		if (!async && maxclen == 0) {	/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
		} else {	/* Wait for rest of cluster */
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out if seqcount tells us we
		 * are operating sequentially, otherwise let the buf or
		 * update daemon handle it.
		 */
		cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
		vp->v_cstart = lbn + 1;
	} else if (vm_page_count_severe()) {
		/*
		 * We are low on memory, get it going NOW
		 */
		bawrite(bp);
	} else {
		/*
		 * In the middle of a cluster, so just delay the I/O for now.
		 */
		bdwrite(bp);
	}
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block (if last_bp == NULL).
 */
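/*
 * Illustrative call (parameters invented for the example): the write path
 * above reaches this function through cluster_wbuild_wb(); something like
 * cluster_wbuild(vp, 8192, 0, 8) asks for dirty logical blocks 0..7 of vp
 * to be gathered, if possible, into a single 64kB physical write.
 */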
int
cluster_wbuild(vp, size, start_lbn, len)
	struct buf *bp, *tbp;
	int totalwritten = 0;
	int dbsize = btodb(size);
		/*
		 * If the buffer is not delayed-write (i.e. dirty), or it
		 * is delayed-write but either locked or inval, it cannot
		 * partake in the clustered write.
		 */
		if ((tbp = gbincore(&vp->v_bufobj, start_lbn)) == NULL ||
		    (tbp->b_vflags & BV_BKGRDINPROG)) {
		if (BUF_LOCK(tbp,
		    LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, BO_MTX(bo))) {
		if ((tbp->b_flags & (B_INVAL | B_DELWRI)) != B_DELWRI) {
		if (tbp->b_pin_count > 0) {
		tbp->b_flags &= ~B_DONE;
		/*
		 * Extra memory in the buffer, punt on this buffer.
		 * XXX we could handle this in most cases, but we would
		 * have to push the extra memory down to after our max
		 * possible cluster size and then potentially pull it back
		 * up if the cluster was terminated prematurely--too much
		 * hassle.
		 */
		if (((tbp->b_flags & (B_CLUSTEROK | B_MALLOC | B_VMIO)) !=
		    (B_CLUSTEROK | B_VMIO)) ||
		    (tbp->b_bcount != tbp->b_bufsize) ||
		    (tbp->b_bcount != size) ||
		    ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
			totalwritten += tbp->b_bufsize;
		/*
		 * We got a pbuf to make the cluster in.
		 */
		TAILQ_INIT(&bp->b_cluster.cluster_head);
		if (tbp->b_wcred != NOCRED)
			bp->b_wcred = crhold(tbp->b_wcred);

		bp->b_blkno = tbp->b_blkno;
		bp->b_lblkno = tbp->b_lblkno;
		bp->b_offset = tbp->b_offset;
		/*
		 * We are synthesizing a buffer out of vm_page_t's, but
		 * if the block size is not page aligned then the starting
		 * address may not be either.  Inherit the b_data offset
		 * from the original buffer.
		 */
		bp->b_data = (char *)((vm_offset_t)bp->b_data |
		    ((vm_offset_t)tbp->b_data & PAGE_MASK));
		bp->b_flags |= B_CLUSTER |
		    (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
		bp->b_iodone = cluster_callback;
		/*
		 * From this location in the file, scan forward to see
		 * if there are buffers with adjacent data that need to
		 * be written as well.
		 */
		for (i = 0; i < len; ++i, ++start_lbn) {
			if (i != 0) { /* If not the first buffer */
				/*
				 * If the adjacent data is not even in core it
				 * can't need to be written.
				 */
				if ((tbp = gbincore(bo, start_lbn)) == NULL ||
				    (tbp->b_vflags & BV_BKGRDINPROG)) {
				/*
				 * If it IS in core, but has different
				 * characteristics, or is locked (which
				 * means it could be undergoing a background
				 * I/O or be in a weird state), then don't
				 * cluster with it.
				 */
				if (BUF_LOCK(tbp,
				    LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK,
				    BO_MTX(bo))) {
				if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
				    B_INVAL | B_DELWRI | B_NEEDCOMMIT))
				    != (B_DELWRI | B_CLUSTEROK |
				    (bp->b_flags & (B_VMIO | B_NEEDCOMMIT))) ||
				    tbp->b_wcred != bp->b_wcred) {
				/*
				 * Check that the combined cluster
				 * would make sense with regard to pages
				 * and would not be too large
				 */
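				/*
				 * Illustrative bound (example values): with
				 * 4kB pages and mnt_iosize_max = 64kB the
				 * test below caps a cluster at 16 pages, and
				 * the b_blkno check only admits a buffer
				 * whose disk address is exactly i blocks
				 * past the cluster's starting block.
				 */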
				if ((tbp->b_bcount != size) ||
				    ((bp->b_blkno + (dbsize * i)) !=
				      tbp->b_blkno) ||
				    ((tbp->b_npages + bp->b_npages) >
				      (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) {
				/*
				 * Do not pull in pinned buffers.
				 */
				if (tbp->b_pin_count > 0) {
				/*
				 * Ok, it's passed all the tests,
				 * so remove it from the free list
				 * and mark it busy. We will use it.
				 */
				tbp->b_flags &= ~B_DONE;
			} /* end of code for non-first buffers only */
			/*
			 * If the IO is via the VM then we do some
			 * special VM hackery (yuck).  Since the buffer's
			 * block size may not be page-aligned it is possible
			 * for a page to be shared between two buffers.  We
			 * have to get rid of the duplication when building
			 * the cluster.
			 */
			if (tbp->b_flags & B_VMIO) {
				VM_OBJECT_LOCK(tbp->b_bufobj->bo_object);

				if (i != 0) { /* if not first buffer */
					for (j = 0; j < tbp->b_npages; j += 1) {
						m = tbp->b_pages[j];
						if (m->oflags & VPO_BUSY) {
				for (j = 0; j < tbp->b_npages; j += 1) {
					m = tbp->b_pages[j];
					vm_object_pip_add(m->object, 1);
					if ((bp->b_npages == 0) ||
					    (bp->b_pages[bp->b_npages - 1] != m)) {
						bp->b_pages[bp->b_npages] = m;
						bp->b_npages++;
					}
				}
				VM_OBJECT_UNLOCK(tbp->b_bufobj->bo_object);
			bp->b_bcount += size;
			bp->b_bufsize += size;

			tbp->b_flags &= ~B_DONE;
			tbp->b_ioflags &= ~BIO_ERROR;
			tbp->b_flags |= B_ASYNC;
			tbp->b_iocmd = BIO_WRITE;
			reassignbuf(tbp);		/* put on clean list */
			bufobj_wref(tbp->b_bufobj);
			TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
			    tbp, b_cluster.cluster_entry);
		pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
		    (vm_page_t *) bp->b_pages, bp->b_npages);
		if (bp->b_bufsize > bp->b_kvasize)
			panic(
			    "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
			    bp->b_bufsize, bp->b_kvasize);
		bp->b_kvasize = bp->b_bufsize;
		totalwritten += bp->b_bufsize;
		bp->b_dirtyoff = 0;
		bp->b_dirtyend = bp->b_bufsize;
/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 */
static struct cluster_save *
cluster_collectbufs(vp, last_bp)
	struct cluster_save *buflist;

	len = vp->v_lastw - vp->v_cstart + 1;
	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
	    M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **) (buflist + 1);
	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) {
		(void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp);
		buflist->bs_children[i] = bp;
		if (bp->b_blkno == bp->b_lblkno)
			VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,
			    NULL, NULL);
	}
	buflist->bs_children[i] = bp = last_bp;
	if (bp->b_blkno == bp->b_lblkno)
		VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
	buflist->bs_nchildren = i + 1;
	return (buflist);