/*
 * kernel - Add new bufcache/VM consolidated API, fsx fixes for NFS
 * [dragonfly.git] / sys / kern / vfs_vm.c
 * blob 5cae1308b894466128ce89a5d3a99375b877c829
 */
1 /*
2 * Copyright (c) 2010 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
36 * Implements new VFS/VM coherency functions. For conforming VFSs
37 * we treat the backing VM object slightly differently. Instead of
38 * maintaining a number of pages to exactly fit the size of the file
39 * we instead maintain pages to fit the entire contents of the last
40 * buffer cache buffer used by the file.
42 * For VFSs like NFS and HAMMER which use (generally speaking) fixed
43 * sized buffers this greatly reduces the complexity of VFS/VM interactions.
45 * Truncations no longer invalidate pages covered by the buffer cache
46 * beyond the file EOF which still fit within the file's last buffer.
47 * We simply unmap them and do not allow userland to fault them in.
49 * The VFS is no longer responsible for zero-filling buffers during a
50 * truncation, the last buffer will be automatically zero-filled by
51 * nvtruncbuf().
53 * This code is intended to (eventually) replace vtruncbuf() and
54 * vnode_pager_setsize().
57 #include <sys/param.h>
58 #include <sys/systm.h>
59 #include <sys/buf.h>
60 #include <sys/conf.h>
61 #include <sys/fcntl.h>
62 #include <sys/file.h>
63 #include <sys/kernel.h>
64 #include <sys/malloc.h>
65 #include <sys/mount.h>
66 #include <sys/proc.h>
67 #include <sys/socket.h>
68 #include <sys/stat.h>
69 #include <sys/sysctl.h>
70 #include <sys/unistd.h>
71 #include <sys/vmmeter.h>
72 #include <sys/vnode.h>
74 #include <machine/limits.h>
76 #include <vm/vm.h>
77 #include <vm/vm_object.h>
78 #include <vm/vm_extern.h>
79 #include <vm/vm_kern.h>
80 #include <vm/pmap.h>
81 #include <vm/vm_map.h>
82 #include <vm/vm_page.h>
83 #include <vm/vm_pager.h>
84 #include <vm/vnode_pager.h>
85 #include <vm/vm_zone.h>
87 #include <sys/buf2.h>
88 #include <sys/thread2.h>
89 #include <sys/sysref2.h>
90 #include <sys/mplock2.h>
92 static int nvtruncbuf_bp_trunc_cmp(struct buf *bp, void *data);
93 static int nvtruncbuf_bp_trunc(struct buf *bp, void *data);
94 static int nvtruncbuf_bp_metasync_cmp(struct buf *bp, void *data);
95 static int nvtruncbuf_bp_metasync(struct buf *bp, void *data);
98 * Truncate a file's buffer and pages to a specified length. The
99 * byte-granular length of the file is specified along with the block
100 * size of the buffer containing that offset.
102 * If the last buffer straddles the length its contents will be zero-filled
103 * as appropriate. All buffers and pages after the last buffer will be
104 * destroyed. The last buffer itself will be destroyed only if the length
105 * is exactly aligned with it.
108 nvtruncbuf(struct vnode *vp, off_t length, int blksize)
110 off_t truncloffset;
111 off_t truncboffset;
112 const char *filename;
113 lwkt_tokref vlock;
114 struct buf *bp;
115 int count;
116 int boff;
117 int error;
120 * Round up to the *next* block, then destroy the buffers in question.
121 * Since we are only removing some of the buffers we must rely on the
122 * scan count to determine whether a loop is necessary.
124 * Destroy any pages beyond the last buffer.
126 boff = (int)(length % blksize);
127 if (boff)
128 truncloffset = length + (blksize - boff);
129 else
130 truncloffset = length;
132 lwkt_gettoken(&vlock, &vp->v_token);
133 do {
134 count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
135 nvtruncbuf_bp_trunc_cmp,
136 nvtruncbuf_bp_trunc, &truncloffset);
137 count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
138 nvtruncbuf_bp_trunc_cmp,
139 nvtruncbuf_bp_trunc, &truncloffset);
140 } while(count);
142 nvnode_pager_setsize(vp, length, blksize);
145 * Zero-fill the area beyond the file EOF that still fits within
146 * the last buffer. Even though we are modifying the contents
147 * of a buffer we are doing so beyond the file EOF and it doesn't
148 * count as a real modification.
150 * The VFS is responsible for dealing with the actual truncation.
152 if (boff) {
153 truncboffset = length - boff;
154 error = bread(vp, truncboffset, blksize, &bp);
155 if (error == 0) {
156 bzero(bp->b_data + boff, blksize - boff);
157 if (bp->b_flags & B_DELWRI) {
158 if (bp->b_dirtyoff > boff)
159 bp->b_dirtyoff = boff;
160 if (bp->b_dirtyend > boff)
161 bp->b_dirtyend = boff;
163 bqrelse(bp);
165 } else {
166 error = 0;
170 * For safety, fsync any remaining metadata if the file is not being
171 * truncated to 0. Since the metadata does not represent the entire
172 * dirty list we have to rely on the hit count to ensure that we get
173 * all of it.
175 if (length > 0) {
176 do {
177 count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
178 nvtruncbuf_bp_metasync_cmp,
179 nvtruncbuf_bp_metasync, vp);
180 } while (count);
184 * It is possible to have in-progress I/O from buffers that were
185 * not part of the truncation. This should not happen if we
186 * are truncating to 0-length.
188 bio_track_wait(&vp->v_track_write, 0, 0);
191 * Debugging only
193 spin_lock_wr(&vp->v_spinlock);
194 filename = TAILQ_FIRST(&vp->v_namecache) ?
195 TAILQ_FIRST(&vp->v_namecache)->nc_name : "?";
196 spin_unlock_wr(&vp->v_spinlock);
199 * Make sure no buffers were instantiated while we were trying
200 * to clean out the remaining VM pages. This could occur due
201 * to busy dirty VM pages being flushed out to disk.
203 do {
204 count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
205 nvtruncbuf_bp_trunc_cmp,
206 nvtruncbuf_bp_trunc, &truncloffset);
207 count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
208 nvtruncbuf_bp_trunc_cmp,
209 nvtruncbuf_bp_trunc, &truncloffset);
210 if (count) {
211 kprintf("Warning: vtruncbuf(): Had to re-clean %d "
212 "left over buffers in %s\n", count, filename);
214 } while(count);
216 lwkt_reltoken(&vlock);
218 return (error);
222 * The callback buffer is beyond the new file EOF and must be destroyed.
223 * Note that the compare function must conform to the RB_SCAN's requirements.
225 static
227 nvtruncbuf_bp_trunc_cmp(struct buf *bp, void *data)
229 if (bp->b_loffset >= *(off_t *)data)
230 return(0);
231 return(-1);
234 static
236 nvtruncbuf_bp_trunc(struct buf *bp, void *data)
239 * Do not try to use a buffer we cannot immediately lock, but sleep
240 * anyway to prevent a livelock. The code will loop until all buffers
241 * can be acted upon.
243 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
244 if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
245 BUF_UNLOCK(bp);
246 } else {
247 bremfree(bp);
248 bp->b_flags |= (B_INVAL | B_RELBUF | B_NOCACHE);
249 brelse(bp);
251 return(1);
255 * Fsync all meta-data after truncating a file to be non-zero. Only metadata
256 * blocks (with a negative loffset) are scanned.
257 * Note that the compare function must conform to the RB_SCAN's requirements.
259 static int
260 nvtruncbuf_bp_metasync_cmp(struct buf *bp, void *data)
262 if (bp->b_loffset < 0)
263 return(0);
264 return(1);
267 static int
268 nvtruncbuf_bp_metasync(struct buf *bp, void *data)
270 struct vnode *vp = data;
272 if (bp->b_flags & B_DELWRI) {
274 * Do not try to use a buffer we cannot immediately lock,
275 * but sleep anyway to prevent a livelock. The code will
276 * loop until all buffers can be acted upon.
278 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
279 if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
280 BUF_UNLOCK(bp);
281 } else {
282 bremfree(bp);
283 if (bp->b_vp == vp)
284 bawrite(bp);
285 else
286 bwrite(bp);
288 return(1);
289 } else {
290 return(0);
295 * Extend a file's buffer and pages to a new, larger size. Note that the
296 * blocksize passed is for the buffer covering the old file size, NOT the
297 * new file size.
299 * To make this explicit we require the old length to passed even though
300 * we can acquire it from vp->v_filesize.
302 * If the caller intends to immediately write into the newly extended
303 * space pass trivial == 1. If trivial is 0 the original buffer will be
304 * zero-filled as necessary to clean out any junk in the extended space.
306 * NOTE: We do not zero-fill to the end of the buffer or page to remove
307 * mmap cruft since userland can just re-cruft it. Filesystems are
308 * responsible for zero-filling extra space beyond the file EOF during
309 * strategy write functions, or zero-filling junk areas on read.
312 nvextendbuf(struct vnode *vp, off_t olength, off_t nlength,
313 int oblksize, int nblksize, int trivial)
315 off_t truncboffset;
316 struct buf *bp;
317 int boff;
318 int error;
320 error = 0;
321 nvnode_pager_setsize(vp, nlength, nblksize);
322 if (trivial == 0) {
323 boff = (int)(olength % oblksize);
324 truncboffset = olength - boff;
326 if (boff) {
327 error = bread(vp, truncboffset, oblksize, &bp);
328 if (error == 0) {
329 bzero(bp->b_data + boff, oblksize - boff);
330 bqrelse(bp);
334 return (error);
338 * Set vp->v_filesize and vp->v_object->size, destroy pages beyond
339 * the last buffer when truncating.
341 * This function does not do any zeroing or invalidating of partially
342 * overlapping pages. Zeroing is the responsibility of nvtruncbuf().
343 * However, it does unmap VM pages from the user address space on a
344 * page-granular (verses buffer cache granular) basis.
346 void
347 nvnode_pager_setsize(struct vnode *vp, off_t length, int blksize)
349 vm_pindex_t nobjsize;
350 vm_pindex_t oobjsize;
351 vm_pindex_t pi;
352 vm_object_t object;
353 vm_page_t m;
354 off_t truncboffset;
355 int boff;
358 * Degenerate conditions
360 if ((object = vp->v_object) == NULL)
361 return;
362 if (length == vp->v_filesize)
363 return;
366 * Calculate the size of the VM object, coverage includes
367 * the buffer straddling EOF. If EOF is buffer-aligned
368 * we don't bother.
370 * Buffers do not have to be page-aligned. Make sure
371 * nobjsize is beyond the last page of the buffer.
373 boff = (int)(length % blksize);
374 truncboffset = length - boff;
375 oobjsize = object->size;
376 if (boff)
377 nobjsize = OFF_TO_IDX(truncboffset + blksize + PAGE_MASK);
378 else
379 nobjsize = OFF_TO_IDX(truncboffset + PAGE_MASK);
380 object->size = nobjsize;
382 if (length < vp->v_filesize) {
384 * File has shrunk, toss any cached pages beyond
385 * the end of the buffer (blksize aligned) for the
386 * new EOF.
388 vp->v_filesize = length;
389 if (nobjsize < oobjsize) {
390 vm_object_page_remove(object, nobjsize, oobjsize,
391 FALSE);
395 * Unmap any pages (page aligned) beyond the new EOF.
396 * The pages remain part of the (last) buffer and are not
397 * invalidated.
399 pi = OFF_TO_IDX(length + PAGE_MASK);
400 while (pi < nobjsize) {
401 do {
402 m = vm_page_lookup(object, pi);
403 } while (m && vm_page_sleep_busy(m, TRUE, "vsetsz"));
404 if (m) {
405 vm_page_busy(m);
406 vm_page_protect(m, VM_PROT_NONE);
407 vm_page_wakeup(m);
409 ++pi;
411 } else {
413 * File has expanded.
415 vp->v_filesize = length;