2 * Copyright (c) 2010 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * Implements new VFS/VM coherency functions. For conforming VFSs
37 * we treat the backing VM object slightly differently. Instead of
38 * maintaining a number of pages to exactly fit the size of the file
39 * we instead maintain pages to fit the entire contents of the last
40 * buffer cache buffer used by the file.
42 * For VFSs like NFS and HAMMER which use (generally speaking) fixed
43 * sized buffers this greatly reduces the complexity of VFS/VM interactions.
45 * Truncations no longer invalidate pages covered by the buffer cache
46 * beyond the file EOF which still fit within the file's last buffer.
47 * We simply unmap them and do not allow userland to fault them in.
49 * The VFS is no longer responsible for zero-filling buffers during a
 * truncation, the last buffer will be automatically zero-filled by
 * nvtruncbuf().
53 * This code is intended to (eventually) replace vtruncbuf() and
54 * vnode_pager_setsize().
57 #include <sys/param.h>
58 #include <sys/systm.h>
61 #include <sys/fcntl.h>
63 #include <sys/kernel.h>
64 #include <sys/malloc.h>
65 #include <sys/mount.h>
67 #include <sys/socket.h>
69 #include <sys/sysctl.h>
70 #include <sys/unistd.h>
71 #include <sys/vmmeter.h>
72 #include <sys/vnode.h>
74 #include <machine/limits.h>
77 #include <vm/vm_object.h>
78 #include <vm/vm_extern.h>
79 #include <vm/vm_kern.h>
81 #include <vm/vm_map.h>
82 #include <vm/vm_page.h>
83 #include <vm/vm_pager.h>
84 #include <vm/vnode_pager.h>
85 #include <vm/vm_zone.h>
88 #include <vm/vm_page2.h>
/*
 * RB_SCAN comparison and action callbacks used by nvtruncbuf().
 */
static int nvtruncbuf_bp_trunc_cmp(struct buf *bp, void *data);
static int nvtruncbuf_bp_trunc(struct buf *bp, void *data);
static int nvtruncbuf_bp_metasync_cmp(struct buf *bp, void *data);
static int nvtruncbuf_bp_metasync(struct buf *bp, void *data);
96 * Truncate a file's buffer and pages to a specified length. The
97 * byte-granular length of the file is specified along with the block
98 * size of the buffer containing that offset.
100 * If the last buffer straddles the length its contents will be zero-filled
101 * as appropriate. All buffers and pages after the last buffer will be
102 * destroyed. The last buffer itself will be destroyed only if the length
103 * is exactly aligned with it.
105 * UFS typically passes the old block size prior to the actual truncation,
106 * then later resizes the block based on the new file size. NFS uses a
107 * fixed block size and doesn't care. HAMMER uses a block size based on
108 * the offset which is fixed for any particular offset.
110 * When zero-filling we must bdwrite() to avoid a window of opportunity
111 * where the kernel might throw away a clean buffer and the filesystem
112 * then attempts to bread() it again before completing (or as part of)
113 * the extension. The filesystem is still responsible for zero-filling
114 * any remainder when writing to the media in the strategy function when
115 * it is able to do so without the page being mapped. The page may still
116 * be mapped by userland here.
118 * When modifying a buffer we must clear any cached raw disk offset.
119 * bdwrite() will call BMAP on it again. Some filesystems, like HAMMER,
120 * never overwrite existing data blocks.
123 struct truncbuf_info
{
125 off_t truncloffset
; /* truncation point */
126 int clean
; /* clean tree, else dirty tree */
130 nvtruncbuf(struct vnode
*vp
, off_t length
, int blksize
, int boff
, int flags
)
132 struct truncbuf_info info
;
134 const char *filename
;
140 * Round up to the *next* block, then destroy the buffers in question.
141 * Since we are only removing some of the buffers we must rely on the
142 * scan count to determine whether a loop is necessary.
144 * Destroy any pages beyond the last buffer.
147 boff
= (int)(length
% blksize
);
149 info
.truncloffset
= length
+ (blksize
- boff
);
151 info
.truncloffset
= length
;
153 lwkt_gettoken(&vp
->v_token
);
156 count
= RB_SCAN(buf_rb_tree
, &vp
->v_rbclean_tree
,
157 nvtruncbuf_bp_trunc_cmp
,
158 nvtruncbuf_bp_trunc
, &info
);
160 count
+= RB_SCAN(buf_rb_tree
, &vp
->v_rbdirty_tree
,
161 nvtruncbuf_bp_trunc_cmp
,
162 nvtruncbuf_bp_trunc
, &info
);
165 nvnode_pager_setsize(vp
, length
, blksize
, boff
);
168 * Zero-fill the area beyond the file EOF that still fits within
169 * the last buffer. We must mark the buffer as dirty even though
170 * the modified area is beyond EOF to avoid races where the kernel
171 * might flush the buffer before the filesystem is able to reallocate
174 * The VFS is responsible for dealing with the actual truncation.
176 * Only do this if NVEXTF_TRIVIAL is not set, otherwise it is up to
177 * the VFS to handle the block straddling the EOF.
179 if (boff
&& (flags
& NVEXTF_TRIVIAL
) == 0) {
180 truncboffset
= length
- boff
;
181 error
= bread_kvabio(vp
, truncboffset
, blksize
, &bp
);
184 bzero(bp
->b_data
+ boff
, blksize
- boff
);
185 if (bp
->b_flags
& B_DELWRI
) {
186 if (bp
->b_dirtyoff
> boff
)
187 bp
->b_dirtyoff
= boff
;
188 if (bp
->b_dirtyend
> boff
)
189 bp
->b_dirtyend
= boff
;
191 bp
->b_bio2
.bio_offset
= NOOFFSET
;
192 if (flags
& NVEXTF_BUWRITE
)
197 kprintf("nvtruncbuf: bread error %d @0x%016jx\n",
198 error
, truncboffset
);
199 bp
->b_flags
|= B_INVAL
| B_RELBUF
;
207 * For safety, fsync any remaining metadata if the file is not being
208 * truncated to 0. Since the metadata does not represent the entire
209 * dirty list we have to rely on the hit count to ensure that we get
212 * This is typically applicable only to UFS. NFS and HAMMER do
213 * not store indirect blocks in the per-vnode buffer cache.
217 count
= RB_SCAN(buf_rb_tree
, &vp
->v_rbdirty_tree
,
218 nvtruncbuf_bp_metasync_cmp
,
219 nvtruncbuf_bp_metasync
, &info
);
224 * It is possible to have in-progress I/O from buffers that were
225 * not part of the truncation. This should not happen if we
226 * are truncating to 0-length.
228 bio_track_wait(&vp
->v_track_write
, 0, 0);
233 spin_lock(&vp
->v_spin
);
234 filename
= TAILQ_FIRST(&vp
->v_namecache
) ?
235 TAILQ_FIRST(&vp
->v_namecache
)->nc_name
: "?";
236 spin_unlock(&vp
->v_spin
);
239 * Make sure no buffers were instantiated while we were trying
240 * to clean out the remaining VM pages. This could occur due
241 * to busy dirty VM pages being flushed out to disk.
245 count
= RB_SCAN(buf_rb_tree
, &vp
->v_rbclean_tree
,
246 nvtruncbuf_bp_trunc_cmp
,
247 nvtruncbuf_bp_trunc
, &info
);
249 count
+= RB_SCAN(buf_rb_tree
, &vp
->v_rbdirty_tree
,
250 nvtruncbuf_bp_trunc_cmp
,
251 nvtruncbuf_bp_trunc
, &info
);
253 kprintf("Warning: vtruncbuf(): Had to re-clean %d "
254 "left over buffers in %s\n", count
, filename
);
258 lwkt_reltoken(&vp
->v_token
);
264 * The callback buffer is beyond the new file EOF and must be destroyed.
265 * Note that the compare function must conform to the RB_SCAN's requirements.
269 nvtruncbuf_bp_trunc_cmp(struct buf
*bp
, void *data
)
271 struct truncbuf_info
*info
= data
;
273 if (bp
->b_loffset
>= info
->truncloffset
)
280 nvtruncbuf_bp_trunc(struct buf
*bp
, void *data
)
282 struct truncbuf_info
*info
= data
;
285 * Do not try to use a buffer we cannot immediately lock,
286 * but sleep anyway to prevent a livelock. The code will
287 * loop until all buffers can be acted upon.
289 if (BUF_LOCK(bp
, LK_EXCLUSIVE
| LK_NOWAIT
)) {
290 atomic_add_int(&bp
->b_refs
, 1);
291 if (BUF_LOCK(bp
, LK_EXCLUSIVE
|LK_SLEEPFAIL
) == 0)
293 atomic_subtract_int(&bp
->b_refs
, 1);
294 } else if ((info
->clean
&& (bp
->b_flags
& B_DELWRI
)) ||
295 (info
->clean
== 0 && (bp
->b_flags
& B_DELWRI
) == 0) ||
296 bp
->b_vp
!= info
->vp
||
297 nvtruncbuf_bp_trunc_cmp(bp
, data
)) {
301 bp
->b_flags
|= (B_INVAL
| B_RELBUF
| B_NOCACHE
);
309 * Fsync all meta-data after truncating a file to be non-zero. Only metadata
310 * blocks (with a negative loffset) are scanned.
311 * Note that the compare function must conform to the RB_SCAN's requirements.
314 nvtruncbuf_bp_metasync_cmp(struct buf
*bp
, void *data __unused
)
316 if (bp
->b_loffset
< 0)
323 nvtruncbuf_bp_metasync(struct buf
*bp
, void *data
)
325 struct truncbuf_info
*info
= data
;
328 * Do not try to use a buffer we cannot immediately lock,
329 * but sleep anyway to prevent a livelock. The code will
330 * loop until all buffers can be acted upon.
332 if (BUF_LOCK(bp
, LK_EXCLUSIVE
| LK_NOWAIT
)) {
333 atomic_add_int(&bp
->b_refs
, 1);
334 if (BUF_LOCK(bp
, LK_EXCLUSIVE
|LK_SLEEPFAIL
) == 0)
336 atomic_subtract_int(&bp
->b_refs
, 1);
337 } else if ((bp
->b_flags
& B_DELWRI
) == 0 ||
338 bp
->b_vp
!= info
->vp
||
339 nvtruncbuf_bp_metasync_cmp(bp
, data
)) {
350 * Extend a file's buffer and pages to a new, larger size. The block size
351 * at both the old and new length must be passed, but buffer cache operations
352 * will only be performed on the old block. The new nlength/nblksize will
353 * be used to properly set the VM object size.
355 * To make this explicit we require the old length to passed even though
356 * we can acquire it from vp->v_filesize, which also avoids potential
357 * corruption if the filesystem and vp get desynchronized somehow.
359 * If the caller intends to immediately write into the newly extended
360 * space pass NVEXTF_TRIVIAL. If not set, the original buffer will be
361 * zero-filled as necessary to clean out any junk in the extended space.
362 * If non-zero the original buffer (straddling EOF) is not touched.
364 * When zero-filling we must bdwrite() to avoid a window of opportunity
365 * where the kernel might throw away a clean buffer and the filesystem
366 * then attempts to bread() it again before completing (or as part of)
367 * the extension. The filesystem is still responsible for zero-filling
368 * any remainder when writing to the media in the strategy function when
369 * it is able to do so without the page being mapped. The page may still
370 * be mapped by userland here.
372 * When modifying a buffer we must clear any cached raw disk offset.
373 * bdwrite() will call BMAP on it again. Some filesystems, like HAMMER,
374 * never overwrite existing data blocks.
377 nvextendbuf(struct vnode
*vp
, off_t olength
, off_t nlength
,
378 int oblksize
, int nblksize
, int oboff
, int nboff
, int flags
)
385 nvnode_pager_setsize(vp
, nlength
, nblksize
, nboff
);
386 if ((flags
& NVEXTF_TRIVIAL
) == 0) {
388 oboff
= (int)(olength
% oblksize
);
389 truncboffset
= olength
- oboff
;
392 error
= bread_kvabio(vp
, truncboffset
, oblksize
, &bp
);
395 bzero(bp
->b_data
+ oboff
, oblksize
- oboff
);
396 bp
->b_bio2
.bio_offset
= NOOFFSET
;
397 if (flags
& NVEXTF_BUWRITE
)
402 kprintf("nvextendbuf: bread EOF @ %016jx "
404 truncboffset
, error
);
405 bp
->b_flags
|= B_INVAL
| B_RELBUF
;
414 * Set vp->v_filesize and vp->v_object->size, destroy pages beyond
415 * the last buffer when truncating.
417 * This function does not do any zeroing or invalidating of partially
418 * overlapping pages. Zeroing is the responsibility of nvtruncbuf().
419 * However, it does unmap VM pages from the user address space on a
420 * page-granular (verses buffer cache granular) basis.
422 * If boff is passed as -1 the base offset of the buffer cache buffer is
423 * calculated from length and blksize. Filesystems such as UFS which deal
424 * with fragments have to specify a boff >= 0 since the base offset cannot
425 * be calculated from length and blksize.
427 * For UFS blksize is the 'new' blocksize, used only to determine how large
428 * the VM object must become.
431 nvnode_pager_setsize(struct vnode
*vp
, off_t length
, int blksize
, int boff
)
433 vm_pindex_t nobjsize
;
434 vm_pindex_t oobjsize
;
441 * Degenerate conditions
443 if ((object
= vp
->v_object
) == NULL
)
445 vm_object_hold(object
);
446 if (length
== vp
->v_filesize
) {
447 vm_object_drop(object
);
452 * Calculate the size of the VM object, coverage includes
453 * the buffer straddling EOF. If EOF is buffer-aligned
456 * Buffers do not have to be page-aligned. Make sure
457 * nobjsize is beyond the last page of the buffer.
460 boff
= (int)(length
% blksize
);
461 truncboffset
= length
- boff
;
462 oobjsize
= object
->size
;
464 nobjsize
= OFF_TO_IDX(truncboffset
+ blksize
+ PAGE_MASK
);
466 nobjsize
= OFF_TO_IDX(truncboffset
+ PAGE_MASK
);
467 object
->size
= nobjsize
;
469 if (length
< vp
->v_filesize
) {
471 * File has shrunk, toss any cached pages beyond
472 * the end of the buffer (blksize aligned) for the
475 vp
->v_filesize
= length
;
476 if (nobjsize
< oobjsize
) {
477 vm_object_page_remove(object
, nobjsize
, oobjsize
,
482 * Unmap any pages (page aligned) beyond the new EOF.
483 * The pages remain part of the (last) buffer and are not
486 pi
= OFF_TO_IDX(length
+ PAGE_MASK
);
487 while (pi
< nobjsize
) {
488 m
= vm_page_lookup_busy_wait(object
, pi
, FALSE
, "vmpg");
490 vm_page_protect(m
, VM_PROT_NONE
);
500 vp
->v_filesize
= length
;
502 vm_object_drop(object
);