kernel - Fix excessive call stack depth on stuck interrupt
[dragonfly.git] / sys / kern / vfs_vm.c
blob e03790b0c0fc77c95befe83bb88c13c5599ebfa3

/*
 * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Implements new VFS/VM coherency functions.  For conforming VFSs
 * we treat the backing VM object slightly differently.  Instead of
 * maintaining a number of pages to exactly fit the size of the file
 * we instead maintain pages to fit the entire contents of the last
 * buffer cache buffer used by the file.
 *
 * For VFSs like NFS and HAMMER which use (generally speaking) fixed
 * sized buffers this greatly reduces the complexity of VFS/VM interactions.
 *
 * Truncations no longer invalidate pages covered by the buffer cache
 * beyond the file EOF which still fit within the file's last buffer.
 * We simply unmap them and do not allow userland to fault them in.
 *
 * The VFS is no longer responsible for zero-filling buffers during a
 * truncation, the last buffer will be automatically zero-filled by
 * nvtruncbuf().
 *
 * This code is intended to (eventually) replace vtruncbuf() and
 * vnode_pager_setsize().
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/unistd.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <vm/vm_zone.h>

#include <sys/buf2.h>
#include <sys/thread2.h>
#include <sys/sysref2.h>
#include <vm/vm_page2.h>

static int nvtruncbuf_bp_trunc_cmp(struct buf *bp, void *data);
static int nvtruncbuf_bp_trunc(struct buf *bp, void *data);
static int nvtruncbuf_bp_metasync_cmp(struct buf *bp, void *data);
static int nvtruncbuf_bp_metasync(struct buf *bp, void *data);

/*
 * Truncate a file's buffer and pages to a specified length.  The
 * byte-granular length of the file is specified along with the block
 * size of the buffer containing that offset.
 *
 * If the last buffer straddles the length its contents will be zero-filled
 * as appropriate.  All buffers and pages after the last buffer will be
 * destroyed.  The last buffer itself will be destroyed only if the length
 * is exactly aligned with it.
 *
 * UFS typically passes the old block size prior to the actual truncation,
 * then later resizes the block based on the new file size.  NFS uses a
 * fixed block size and doesn't care.  HAMMER uses a block size based on
 * the offset, which is fixed for any particular offset.
 *
 * When zero-filling we must bdwrite() to avoid a window of opportunity
 * where the kernel might throw away a clean buffer and the filesystem
 * then attempts to bread() it again before completing (or as part of)
 * the extension.  The filesystem is still responsible for zero-filling
 * any remainder when writing to the media in the strategy function when
 * it is able to do so without the page being mapped.  The page may still
 * be mapped by userland here.
 *
 * When modifying a buffer we must clear any cached raw disk offset.
 * bdwrite() will call BMAP on it again.  Some filesystems, like HAMMER,
 * never overwrite existing data blocks.
 */

struct truncbuf_info {
	struct vnode *vp;
	off_t truncloffset;	/* truncation point */
	int clean;		/* clean tree, else dirty tree */
};

int
nvtruncbuf(struct vnode *vp, off_t length, int blksize, int boff, int trivial)
{
	struct truncbuf_info info;
	off_t truncboffset;
	const char *filename;
	struct buf *bp;
	int count;
	int error;

	/*
	 * Round up to the *next* block, then destroy the buffers in question.
	 * Since we are only removing some of the buffers we must rely on the
	 * scan count to determine whether a loop is necessary.
	 *
	 * Destroy any pages beyond the last buffer.
	 */
	if (boff < 0)
		boff = (int)(length % blksize);
	if (boff)
		info.truncloffset = length + (blksize - boff);
	else
		info.truncloffset = length;
	info.vp = vp;
	lwkt_gettoken(&vp->v_token);
	do {
		info.clean = 1;
		count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
				nvtruncbuf_bp_trunc_cmp,
				nvtruncbuf_bp_trunc, &info);
		info.clean = 0;
		count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
				nvtruncbuf_bp_trunc_cmp,
				nvtruncbuf_bp_trunc, &info);
	} while (count);

	nvnode_pager_setsize(vp, length, blksize, boff);

	/*
	 * Zero-fill the area beyond the file EOF that still fits within
	 * the last buffer.  We must mark the buffer as dirty even though
	 * the modified area is beyond EOF to avoid races where the kernel
	 * might flush the buffer before the filesystem is able to reallocate
	 * the block.
	 *
	 * The VFS is responsible for dealing with the actual truncation.
	 *
	 * Only do this if trivial is zero, otherwise it is up to the
	 * VFS to handle the block straddling the EOF.
	 */
	if (boff && trivial == 0) {
		truncboffset = length - boff;
		error = bread(vp, truncboffset, blksize, &bp);
		if (error == 0) {
			bzero(bp->b_data + boff, blksize - boff);
			if (bp->b_flags & B_DELWRI) {
				if (bp->b_dirtyoff > boff)
					bp->b_dirtyoff = boff;
				if (bp->b_dirtyend > boff)
					bp->b_dirtyend = boff;
			}
			bp->b_bio2.bio_offset = NOOFFSET;
			bdwrite(bp);
		}
	} else {
		error = 0;
	}

	/*
	 * For safety, fsync any remaining metadata if the file is not being
	 * truncated to 0.  Since the metadata does not represent the entire
	 * dirty list we have to rely on the hit count to ensure that we get
	 * all of it.
	 *
	 * This is typically applicable only to UFS.  NFS and HAMMER do
	 * not store indirect blocks in the per-vnode buffer cache.
	 */
	if (length > 0) {
		do {
			count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
					nvtruncbuf_bp_metasync_cmp,
					nvtruncbuf_bp_metasync, &info);
		} while (count);
	}

	/*
	 * It is possible to have in-progress I/O from buffers that were
	 * not part of the truncation.  This should not happen if we
	 * are truncating to 0-length.
	 */
	bio_track_wait(&vp->v_track_write, 0, 0);

	/*
	 * Debugging only
	 */
	spin_lock(&vp->v_spin);
	filename = TAILQ_FIRST(&vp->v_namecache) ?
		   TAILQ_FIRST(&vp->v_namecache)->nc_name : "?";
	spin_unlock(&vp->v_spin);

	/*
	 * Make sure no buffers were instantiated while we were trying
	 * to clean out the remaining VM pages.  This could occur due
	 * to busy dirty VM pages being flushed out to disk.
	 */
	do {
		info.clean = 1;
		count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
				nvtruncbuf_bp_trunc_cmp,
				nvtruncbuf_bp_trunc, &info);
		info.clean = 0;
		count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
				nvtruncbuf_bp_trunc_cmp,
				nvtruncbuf_bp_trunc, &info);
		if (count) {
			kprintf("Warning: vtruncbuf(): Had to re-clean %d "
				"left over buffers in %s\n", count, filename);
		}
	} while (count);

	lwkt_reltoken(&vp->v_token);

	return (error);
}
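
/*
 * Illustrative sketch only (not part of this file): how a VFS with a
 * fixed block size might call nvtruncbuf() from its truncation path.
 * examplefs_truncate() and the 16K block size are hypothetical; a real
 * VFS would also update its own inode size and mark the inode dirty.
 * Passing boff == -1 lets nvtruncbuf() derive the offset within the
 * last buffer from (length % blksize); trivial == 0 asks it to
 * zero-fill the portion of the straddling buffer beyond the new EOF.
 */
#if 0
static int
examplefs_truncate(struct vnode *vp, off_t length)
{
	const int blksize = 16384;	/* hypothetical fixed block size */

	return (nvtruncbuf(vp, length, blksize, -1, 0));
}
#endif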

/*
 * The callback buffer is beyond the new file EOF and must be destroyed.
 * Note that the compare function must conform to the RB_SCAN's requirements.
 */
static
int
nvtruncbuf_bp_trunc_cmp(struct buf *bp, void *data)
{
	struct truncbuf_info *info = data;

	if (bp->b_loffset >= info->truncloffset)
		return(0);
	return(-1);
}

static
int
nvtruncbuf_bp_trunc(struct buf *bp, void *data)
{
	struct truncbuf_info *info = data;

	/*
	 * Do not try to use a buffer we cannot immediately lock,
	 * but sleep anyway to prevent a livelock.  The code will
	 * loop until all buffers can be acted upon.
	 */
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
		atomic_add_int(&bp->b_refs, 1);
		if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
			BUF_UNLOCK(bp);
		atomic_subtract_int(&bp->b_refs, 1);
	} else if ((info->clean && (bp->b_flags & B_DELWRI)) ||
		   (info->clean == 0 && (bp->b_flags & B_DELWRI) == 0) ||
		   bp->b_vp != info->vp ||
		   nvtruncbuf_bp_trunc_cmp(bp, data)) {
		BUF_UNLOCK(bp);
	} else {
		bremfree(bp);
		bp->b_flags |= (B_INVAL | B_RELBUF | B_NOCACHE);
		brelse(bp);
	}
	lwkt_yield();
	return(1);
}

/*
 * Fsync all meta-data after truncating a file to a non-zero length.  Only
 * metadata blocks (with a negative loffset) are scanned.
 * Note that the compare function must conform to the RB_SCAN's requirements.
 */
static int
nvtruncbuf_bp_metasync_cmp(struct buf *bp, void *data __unused)
{
	if (bp->b_loffset < 0)
		return(0);
	lwkt_yield();
	return(1);
}

static int
nvtruncbuf_bp_metasync(struct buf *bp, void *data)
{
	struct truncbuf_info *info = data;

	/*
	 * Do not try to use a buffer we cannot immediately lock,
	 * but sleep anyway to prevent a livelock.  The code will
	 * loop until all buffers can be acted upon.
	 */
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
		atomic_add_int(&bp->b_refs, 1);
		if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
			BUF_UNLOCK(bp);
		atomic_subtract_int(&bp->b_refs, 1);
	} else if ((bp->b_flags & B_DELWRI) == 0 ||
		   bp->b_vp != info->vp ||
		   nvtruncbuf_bp_metasync_cmp(bp, data)) {
		BUF_UNLOCK(bp);
	} else {
		bremfree(bp);
		bawrite(bp);
	}
	lwkt_yield();
	return(1);
}

/*
 * Extend a file's buffer and pages to a new, larger size.  The block size
 * at both the old and new length must be passed, but buffer cache operations
 * will only be performed on the old block.  The new nlength/nblksize will
 * be used to properly set the VM object size.
 *
 * To make this explicit we require the old length to be passed even though
 * we can acquire it from vp->v_filesize, which also avoids potential
 * corruption if the filesystem and vp get desynchronized somehow.
 *
 * If the caller intends to immediately write into the newly extended
 * space pass trivial == 1.  If trivial is 0 the original buffer will be
 * zero-filled as necessary to clean out any junk in the extended space.
 * If non-zero the original buffer (straddling EOF) is not touched.
 *
 * When zero-filling we must bdwrite() to avoid a window of opportunity
 * where the kernel might throw away a clean buffer and the filesystem
 * then attempts to bread() it again before completing (or as part of)
 * the extension.  The filesystem is still responsible for zero-filling
 * any remainder when writing to the media in the strategy function when
 * it is able to do so without the page being mapped.  The page may still
 * be mapped by userland here.
 *
 * When modifying a buffer we must clear any cached raw disk offset.
 * bdwrite() will call BMAP on it again.  Some filesystems, like HAMMER,
 * never overwrite existing data blocks.
 */
int
nvextendbuf(struct vnode *vp, off_t olength, off_t nlength,
	    int oblksize, int nblksize, int oboff, int nboff, int trivial)
{
	off_t truncboffset;
	struct buf *bp;
	int error;

	error = 0;
	nvnode_pager_setsize(vp, nlength, nblksize, nboff);
	if (trivial == 0) {
		if (oboff < 0)
			oboff = (int)(olength % oblksize);
		truncboffset = olength - oboff;

		if (oboff) {
			error = bread(vp, truncboffset, oblksize, &bp);
			if (error == 0) {
				bzero(bp->b_data + oboff, oblksize - oboff);
				bp->b_bio2.bio_offset = NOOFFSET;
				bdwrite(bp);
			} else {
				kprintf("nvextendbuf: bread EOF @ %016jx "
					"error %d\n",
					(intmax_t)truncboffset, error);
				bp->b_flags |= B_INVAL | B_RELBUF;
				brelse(bp);
			}
		}
	}
	return (error);
}
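
/*
 * Illustrative sketch only (not part of this file): extending a file
 * from olength to nlength on a hypothetical fixed-block-size VFS.  With
 * fixed blocks the old and new block sizes are identical and the buffer
 * offsets may both be passed as -1.  trivial == 0 makes nvextendbuf()
 * zero-fill the buffer straddling the old EOF so stale data cannot leak
 * into the extended range if the caller does not immediately overwrite it.
 */
#if 0
static int
examplefs_extend(struct vnode *vp, off_t olength, off_t nlength)
{
	const int blksize = 16384;	/* hypothetical fixed block size */

	return (nvextendbuf(vp, olength, nlength, blksize, blksize,
			    -1, -1, 0));
}
#endif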

/*
 * Set vp->v_filesize and vp->v_object->size, destroy pages beyond
 * the last buffer when truncating.
 *
 * This function does not do any zeroing or invalidating of partially
 * overlapping pages.  Zeroing is the responsibility of nvtruncbuf().
 * However, it does unmap VM pages from the user address space on a
 * page-granular (versus buffer cache granular) basis.
 *
 * If boff is passed as -1 the base offset of the buffer cache buffer is
 * calculated from length and blksize.  Filesystems such as UFS which deal
 * with fragments have to specify a boff >= 0 since the base offset cannot
 * be calculated from length and blksize.
 *
 * For UFS blksize is the 'new' blocksize, used only to determine how large
 * the VM object must become.
 */
void
nvnode_pager_setsize(struct vnode *vp, off_t length, int blksize, int boff)
{
	vm_pindex_t nobjsize;
	vm_pindex_t oobjsize;
	vm_pindex_t pi;
	vm_object_t object;
	vm_page_t m;
	off_t truncboffset;

	/*
	 * Degenerate conditions
	 */
	if ((object = vp->v_object) == NULL)
		return;
	vm_object_hold(object);
	if (length == vp->v_filesize) {
		vm_object_drop(object);
		return;
	}

	/*
	 * Calculate the size of the VM object, coverage includes
	 * the buffer straddling EOF.  If EOF is buffer-aligned
	 * we don't bother.
	 *
	 * Buffers do not have to be page-aligned.  Make sure
	 * nobjsize is beyond the last page of the buffer.
	 */
	if (boff < 0)
		boff = (int)(length % blksize);
	truncboffset = length - boff;
	oobjsize = object->size;
	if (boff)
		nobjsize = OFF_TO_IDX(truncboffset + blksize + PAGE_MASK);
	else
		nobjsize = OFF_TO_IDX(truncboffset + PAGE_MASK);
	object->size = nobjsize;

	if (length < vp->v_filesize) {
		/*
		 * File has shrunk, toss any cached pages beyond
		 * the end of the buffer (blksize aligned) for the
		 * new EOF.
		 */
		vp->v_filesize = length;
		if (nobjsize < oobjsize) {
			vm_object_page_remove(object, nobjsize, oobjsize,
					      FALSE);
		}

		/*
		 * Unmap any pages (page aligned) beyond the new EOF.
		 * The pages remain part of the (last) buffer and are not
		 * invalidated.
		 */
		pi = OFF_TO_IDX(length + PAGE_MASK);
		while (pi < nobjsize) {
			m = vm_page_lookup_busy_wait(object, pi, FALSE, "vmpg");
			if (m) {
				vm_page_protect(m, VM_PROT_NONE);
				vm_page_wakeup(m);
			}
			++pi;
			lwkt_yield();
		}
	} else {
		/*
		 * File has expanded.
		 */
		vp->v_filesize = length;
	}
	vm_object_drop(object);
}
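
/*
 * Illustrative sketch only (not part of this file): the VM object
 * sizing rule used above, extracted for clarity.  With a hypothetical
 * 16K buffer and a 1-byte file, boff == 1, so the object is sized to
 * cover the entire buffer straddling EOF (four 4K pages), not just the
 * single page containing EOF.
 */
#if 0
static vm_pindex_t
examplefs_objsize(off_t length, int blksize)
{
	off_t truncboffset;
	int boff;

	boff = (int)(length % blksize);
	truncboffset = length - boff;
	if (boff)
		return (OFF_TO_IDX(truncboffset + blksize + PAGE_MASK));
	return (OFF_TO_IDX(truncboffset + PAGE_MASK));
}
#endif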