/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * $FreeBSD: /repoman/r/ncvs/src/sys/nfsclient/nfs_bio.c,v 1.130 2004/04/14 23:23:55 peadar Exp $
 * $DragonFly: src/sys/vfs/nfs/nfs_bio.c,v 1.43 2008/04/22 18:46:53 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/msfbuf.h>

#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <sys/thread2.h>

static struct buf *nfs_getcacheblk(struct vnode *vp, off_t loffset,
				   int size, struct thread *td);
static int nfs_check_dirent(struct nfs_dirent *dp, int maxlen);

extern int nfs_numasync;
extern int nfs_pbuf_freecnt;
extern struct nfsstats nfsstats;

/*
 * Vnode op for VM getpages.
 *
 *	nfs_getpages(struct vnode *a_vp, vm_page_t *a_m, int a_count,
 *		     int a_reqpage, vm_ooffset_t a_offset)
 */
int
nfs_getpages(struct vop_getpages_args *ap)
{
	struct thread *td = curthread;		/* XXX */
	int i, error, nextoff, size, toff, count, npages;
	/* remaining locals inferred from their use below */
	struct vnode *vp = ap->a_vp;
	struct nfsmount *nmp;
	vm_page_t *pages = ap->a_m;
	vm_page_t m;
	struct msf_buf *msf;
	struct iovec iov;
	struct uio uio;
	vm_offset_t kva;

	count = ap->a_count;
	nmp = VFSTONFS(vp->v_mount);

	if (vp->v_object == NULL) {
		kprintf("nfs_getpages: called with non-merged cache vnode??\n");
		return VM_PAGER_ERROR;
	}

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, td);

	npages = btoc(count);
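	/* btoc() above rounds the byte count up to a whole page count. */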
	/*
	 * NOTE that partially valid pages may occur in cases other
	 * than file EOF, such as when a file is partially written and
	 * ftruncate()-extended to a larger size.  It is also possible
	 * for the valid bits to be set on garbage beyond the file EOF and
	 * clear in the area before EOF (e.g. m->valid == 0xfc), which can
	 * occur due to vtruncbuf() and the buffer cache's handling of
	 * pages which 'straddle' buffers or when b_bufsize is not a
	 * multiple of PAGE_SIZE.... the buffer cache cannot normally
	 * clear the extra bits.  This kind of situation occurs when you
	 * make a small write() (m->valid == 0x03) and then mmap() and
	 * fault in the buffer(m->valid = 0xFF).  When NFS flushes the
	 * buffer (vinvalbuf() m->valid = 0xFC) we are left with a mess.
	 *
	 * This is combined with the possibility that the pages are partially
	 * dirty or that there is a buffer backing the pages that is dirty
	 * (even if m->dirty is 0).
	 *
	 * To solve this problem several hacks have been made:  (1) NFS
	 * guarantees that the IO block size is a multiple of PAGE_SIZE and
	 * (2) The buffer cache, when invalidating an NFS buffer, will
	 * disregard the buffer's fragmentary b_bufsize and invalidate
	 * the whole page rather than just the piece the buffer owns.
	 *
	 * This allows us to assume that a partially valid page found here
	 * is fully valid (vm_fault will zero'd out areas of the page not
	 * marked valid).
	 */
	m = pages[ap->a_reqpage];
	if (m->valid != 0) {
		for (i = 0; i < npages; ++i) {
			if (i != ap->a_reqpage)
				vnode_pager_freepage(pages[i]);
		}
		return (0);
	}
	/*
	 * Use an MSF_BUF as a medium to retrieve data from the pages.
	 */
	msf_map_pagelist(&msf, pages, npages, 0);
	KKASSERT(msf);
	kva = msf_buf_kva(msf);
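	/*
	 * The MSF_BUF lends the page list a contiguous kernel virtual
	 * mapping, so the read rpc below can treat the VM pages as one
	 * flat buffer addressed via 'kva'.
	 */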
	iov.iov_base = (caddr_t)kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_td = td;

	error = nfs_readrpc(vp, &uio);
	msf_buf_free(msf);
	if (error && (uio.uio_resid == count)) {
		kprintf("nfs_getpages: error %d\n", error);
		for (i = 0; i < npages; ++i) {
			if (i != ap->a_reqpage)
				vnode_pager_freepage(pages[i]);
		}
		return VM_PAGER_ERROR;
	}
	/*
	 * Calculate the number of bytes read and validate only that number
	 * of bytes.  Note that due to pending writes, size may be 0.  This
	 * does not mean that the remaining data is invalid!
	 */

	size = count - uio.uio_resid;

	for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
		nextoff = toff + PAGE_SIZE;
		m = pages[i];
		m->flags &= ~PG_ZERO;

		if (nextoff <= size) {
			/*
			 * Read operation filled an entire page
			 */
			m->valid = VM_PAGE_BITS_ALL;
			vm_page_undirty(m);
		} else if (size > toff) {
			/*
			 * Read operation filled a partial page.
			 */
			m->valid = 0;
			vm_page_set_validclean(m, 0, size - toff);
			/* handled by vm_fault now */
			/* vm_page_zero_invalid(m, TRUE); */
		} else {
			/*
			 * Read operation was short.  If no error occurred
			 * we may have hit a zero-fill section.  We simply
			 * leave valid set to 0.
			 */
			;
		}
		if (i != ap->a_reqpage) {
			/*
			 * Whether or not to leave the page activated is up in
			 * the air, but we should put the page on a page queue
			 * somewhere (it already is in the object).  Result:
			 * It appears that empirical results show that
			 * deactivating pages is best.
			 */

			/*
			 * Just in case someone was asking for this page we
			 * now tell them that it is ok to use.
			 */
			if (!error) {
				if (m->flags & PG_WANTED)
					vm_page_activate(m);
				else
					vm_page_deactivate(m);
				vm_page_wakeup(m);
			} else {
				vnode_pager_freepage(m);
			}
		}
	}
	return (0);
}

/*
 * Vnode op for VM putpages.
 *
 *	nfs_putpages(struct vnode *a_vp, vm_page_t *a_m, int a_count, int a_sync,
 *		     int *a_rtvals, vm_ooffset_t a_offset)
 */
int
nfs_putpages(struct vop_putpages_args *ap)
{
	struct thread *td = curthread;
	/* remaining locals inferred from their use below */
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	vm_page_t *pages = ap->a_m;
	struct msf_buf *msf;
	struct iovec iov;
	struct uio uio;
	vm_offset_t kva;
	off_t offset;
	int iomode, must_commit, i, error, npages, count;
	int *rtvals;
	struct nfsmount *nmp;

	count = ap->a_count;
	nmp = VFSTONFS(vp->v_mount);
	rtvals = ap->a_rtvals;
	npages = btoc(count);
	offset = IDX_TO_OFF(pages[0]->pindex);
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, td);

	for (i = 0; i < npages; i++) {
		rtvals[i] = VM_PAGER_AGAIN;
	}
	/*
	 * When putting pages, do not extend file past EOF.
	 */
	if (offset + count > np->n_size) {
		count = np->n_size - offset;
		if (count < 0)
			count = 0;
	}
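	/*
	 * The clamp above also covers a concurrent truncate: if EOF has
	 * already moved below 'offset', count drops to zero and no data
	 * is pushed to the server.
	 */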
	/*
	 * Use an MSF_BUF as a medium to retrieve data from the pages.
	 */
	msf_map_pagelist(&msf, pages, npages, 0);
	KKASSERT(msf);
	kva = msf_buf_kva(msf);
	iov.iov_base = (caddr_t)kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = offset;
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_WRITE;
	uio.uio_td = td;
	if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
		iomode = NFSV3WRITE_UNSTABLE;
	else
		iomode = NFSV3WRITE_FILESYNC;
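	/*
	 * NFSv3 write modes: UNSTABLE lets the server reply before the
	 * data reaches stable storage (a later COMMIT rpc is required),
	 * while FILESYNC makes the server commit before replying.
	 */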
	error = nfs_writerpc(vp, &uio, &iomode, &must_commit);

	msf_buf_free(msf);

	if (error == 0) {
		int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;

		for (i = 0; i < nwritten; i++) {
			rtvals[i] = VM_PAGER_OK;
			vm_page_undirty(pages[i]);
		}
		if (must_commit)
			nfs_clearcommit(vp->v_mount);
	}
	return (rtvals[0]);
}

/*
 * Vnode op for read using bio
 */
int
nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct nfsnode *np = VTONFS(vp);
	struct buf *bp = 0, *rabp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	/* remaining locals inferred from their use below */
	struct thread *td = curthread;
	struct vattr vattr;
	daddr_t lbn, rabn;
	off_t loffset, raoffset;
	int biosize, i, seqcount, bcount;
	int nra, error = 0, n = 0, on = 0;
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)	/* XXX VDIR cookies can be negative */
		return (EINVAL);
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, td);
	if (vp->v_type != VDIR &&
	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	biosize = vp->v_mount->mnt_stat.f_iosize;
	seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE);
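	/*
	 * The high bits of ioflag carry the caller's sequential-access
	 * heuristic (IO_SEQSHIFT); scaling by biosize/BKVASIZE converts
	 * it into the number of read-ahead blocks attempted below.
	 */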
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 *
	 * NFS:		If local changes have been made and this is a
	 *		directory, the directory must be invalidated and
	 *		the attribute cache must be cleared.
	 *
	 *		GETATTR is called to synchronize the file size.
	 *
	 *		If remote changes are detected local data is flushed
	 *		and the cache is invalidated.
	 *
	 *		NOTE: In the normal case the attribute cache is not
	 *		cleared which means GETATTR may use cached data and
	 *		not immediately detect changes made on the server.
	 */
	if ((np->n_flag & NLMODIFIED) && vp->v_type == VDIR) {
		nfs_invaldir(vp);
		error = nfs_vinvalbuf(vp, V_SAVE, 1);
		if (error)
			return (error);
		np->n_attrstamp = 0;
	}

	error = VOP_GETATTR(vp, &vattr);
	if (error)
		return (error);
	if (np->n_flag & NRMODIFIED) {
		if (vp->v_type == VDIR)
			nfs_invaldir(vp);
		error = nfs_vinvalbuf(vp, V_SAVE, 1);
		if (error)
			return (error);
		np->n_flag &= ~NRMODIFIED;
	}
	if (np->n_flag & NDONTCACHE) {
		switch (vp->v_type) {
		case VREG:
			return (nfs_readrpc(vp, uio));
		case VLNK:
			return (nfs_readlinkrpc(vp, uio));
		case VDIR:
			break;
		default:
			kprintf(" NDONTCACHE: type %x unexpected\n", vp->v_type);
			break;
		}
	}

	do {
	switch (vp->v_type) {
	case VREG:
		nfsstats.biocache_reads++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		loffset = (off_t)lbn * biosize;
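		/*
		 * Illustrative example (not from the original source):
		 * with biosize = 8192 and uio_offset = 12000 this yields
		 * lbn = 1, on = 3808 and loffset = 8192, i.e. the copy
		 * starts 3808 bytes into the second logical block.
		 */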
		/*
		 * Start the read ahead(s), as required.
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
			for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
			    (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
				rabn = lbn + 1 + nra;
				raoffset = (off_t)rabn * biosize;
				if (!findblk(vp, raoffset)) {
					rabp = nfs_getcacheblk(vp, raoffset, biosize, td);
					if (rabp == NULL)
						return (EINTR);
					if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
						rabp->b_flags |= B_ASYNC;
						rabp->b_cmd = BUF_CMD_READ;
						vfs_busy_pages(vp, rabp);
						if (nfs_asyncio(vp, &rabp->b_bio2, td)) {
							rabp->b_flags |= B_INVAL|B_ERROR;
							vfs_unbusy_pages(rabp);
							brelse(rabp);
						}
					} else {
						brelse(rabp);
					}
				}
			}
		}
		/*
		 * Obtain the buffer cache block.  Figure out the buffer size
		 * when we are at EOF.  If we are modifying the size of the
		 * buffer based on an EOF condition we need to hold
		 * nfs_rslock() through obtaining the buffer to prevent
		 * a potential writer-appender from messing with n_size.
		 * Otherwise we may accidentally truncate the buffer and
		 * lose dirty data.
		 *
		 * Note that bcount is *not* DEV_BSIZE aligned.
		 */
again:
		bcount = biosize;
		if (loffset >= np->n_size) {
			bcount = 0;
		} else if (loffset + biosize > np->n_size) {
			bcount = np->n_size - loffset;
		}
		if (bcount != biosize) {
			switch(nfs_rslock(np)) {
			case ENOLCK:
				goto again;
				/* not reached */
			case EINTR:
			case ERESTART:
				return (EINTR);
				/* not reached */
			default:
				break;
			}
		}

		bp = nfs_getcacheblk(vp, loffset, bcount, td);

		if (bcount != biosize)
			nfs_rsunlock(np);
		if (bp == NULL)
			return (EINTR);
		/*
		 * If B_CACHE is not set, we must issue the read.  If this
		 * fails, we return an error.
		 */
		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_cmd = BUF_CMD_READ;
			vfs_busy_pages(vp, bp);
			error = nfs_doio(vp, &bp->b_bio2, td);
			if (error) {
				brelse(bp);
				return (error);
			}
		}

		/*
		 * on is the offset into the current bp.  Figure out how many
		 * bytes we can copy out of the bp.  Note that bcount is
		 * NOT DEV_BSIZE aligned.
		 *
		 * Then figure out how many bytes we can copy into the uio.
		 */
		n = 0;
		if (on < bcount)
			n = min((unsigned)(bcount - on), uio->uio_resid);
		break;
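		/*
		 * B_CACHE is the buffer cache's "contents are valid" bit:
		 * when it was already set above, the buffer holds good
		 * data and the read rpc is skipped entirely.
		 */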
	case VLNK:
		biosize = min(NFS_MAXPATHLEN, np->n_size);
		nfsstats.biocache_readlinks++;
		bp = nfs_getcacheblk(vp, (off_t)0, biosize, td);
		if (bp == NULL)
			return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_cmd = BUF_CMD_READ;
			vfs_busy_pages(vp, bp);
			error = nfs_doio(vp, &bp->b_bio2, td);
			if (error) {
				bp->b_flags |= B_ERROR;
				brelse(bp);
				return (error);
			}
		}
		n = min(uio->uio_resid, bp->b_bcount - bp->b_resid);
		on = 0;
		break;
	case VDIR:
		nfsstats.biocache_readdirs++;
		if (np->n_direofoffset
		    && uio->uio_offset >= np->n_direofoffset) {
			return (0);
		}
		lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
		loffset = uio->uio_offset - on;
		bp = nfs_getcacheblk(vp, loffset, NFS_DIRBLKSIZ, td);
		if (bp == NULL)
			return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_cmd = BUF_CMD_READ;
			vfs_busy_pages(vp, bp);
			error = nfs_doio(vp, &bp->b_bio2, td);
			if (error)
				brelse(bp);
		}
		while (error == NFSERR_BAD_COOKIE) {
			kprintf("got bad cookie vp %p bp %p\n", vp, bp);
			nfs_invaldir(vp);
			error = nfs_vinvalbuf(vp, 0, 1);
			/*
			 * Yuck! The directory has been modified on the
			 * server.  The only way to get the block is by
			 * reading from the beginning to get all the
			 * offset cookies.
			 *
			 * Leave the last bp intact unless there is an error.
			 * Loop back up to the while if the error is another
			 * NFSERR_BAD_COOKIE (double yuch!).
			 */
			for (i = 0; i <= lbn && !error; i++) {
				if (np->n_direofoffset
				    && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
					return (0);
				bp = nfs_getcacheblk(vp, (off_t)i * NFS_DIRBLKSIZ,
						     NFS_DIRBLKSIZ, td);
				if (bp == NULL)
					return (EINTR);
				if ((bp->b_flags & B_CACHE) == 0) {
					bp->b_cmd = BUF_CMD_READ;
					vfs_busy_pages(vp, bp);
					error = nfs_doio(vp, &bp->b_bio2, td);
					/*
					 * no error + B_INVAL == directory EOF,
					 * use the block.
					 */
					if (error == 0 && (bp->b_flags & B_INVAL))
						break;
				}
				/*
				 * An error will throw away the block and the
				 * for loop will break out.  If no error and this
				 * is not the block we want, we throw away the
				 * block and go for the next one via the for loop.
				 */
				if (error || i < lbn)
					brelse(bp);
			}
		}
		/*
		 * The above while is repeated if we hit another cookie
		 * error.  If we hit an error and it wasn't a cookie error,
		 * we give up.
		 */
		if (error)
			return (error);
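		/*
		 * NFS directory offsets are opaque server-supplied cookies,
		 * not byte offsets; once the cache is invalidated the only
		 * way to re-learn the cookie chain for block lbn is to read
		 * blocks 0..lbn back in order, as the loop above does.
		 */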
		/*
		 * If not eof and read aheads are enabled, start one.
		 * (You need the current block first, so that you have the
		 *  directory offset cookie of the next block.)
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
		    (bp->b_flags & B_INVAL) == 0 &&
		    (np->n_direofoffset == 0 ||
		    loffset + NFS_DIRBLKSIZ < np->n_direofoffset) &&
		    (np->n_flag & NDONTCACHE) == 0 &&
		    !findblk(vp, loffset + NFS_DIRBLKSIZ)) {
			rabp = nfs_getcacheblk(vp, loffset + NFS_DIRBLKSIZ,
					       NFS_DIRBLKSIZ, td);
			if (rabp) {
				if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
					rabp->b_flags |= B_ASYNC;
					rabp->b_cmd = BUF_CMD_READ;
					vfs_busy_pages(vp, rabp);
					if (nfs_asyncio(vp, &rabp->b_bio2, td)) {
						rabp->b_flags |= B_INVAL|B_ERROR;
						vfs_unbusy_pages(rabp);
						brelse(rabp);
					}
				} else {
					brelse(rabp);
				}
			}
		}
		/*
		 * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
		 * chopped for the EOF condition, we cannot tell how large
		 * NFS directories are going to be until we hit EOF.  So
		 * an NFS directory buffer is *not* chopped to its EOF.  Now,
		 * it just so happens that b_resid will effectively chop it
		 * to EOF.  *BUT* this information is lost if the buffer goes
		 * away and is reconstituted into a B_CACHE state ( due to
		 * being VMIO ) later.  So we keep track of the directory eof
		 * in np->n_direofoffset and chop it off as an extra step
		 * right here.
		 */
		n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
		if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
			n = np->n_direofoffset - uio->uio_offset;
		break;
	default:
		kprintf(" nfs_bioread: type %x unexpected\n", vp->v_type);
		break;
	}

	switch (vp->v_type) {
	case VREG:
		if (n > 0)
			error = uiomove(bp->b_data + on, (int)n, uio);
		break;
	case VLNK:
		if (n > 0)
			error = uiomove(bp->b_data + on, (int)n, uio);
		n = 0;
		break;
	case VDIR:
		if (n > 0) {
			off_t old_off = uio->uio_offset;
			caddr_t cpos, epos;
			struct nfs_dirent *dp;

			/*
			 * We are casting cpos to nfs_dirent, it must be
			 * int-aligned.
			 */
			if (on & 3) {
				error = EINVAL;
				break;
			}

			cpos = bp->b_data + on;
			epos = bp->b_data + on + n;
			while (cpos < epos && error == 0 && uio->uio_resid > 0) {
				dp = (struct nfs_dirent *)cpos;
				error = nfs_check_dirent(dp, (int)(epos - cpos));
				if (error)
					break;
				if (vop_write_dirent(&error, uio, dp->nfs_ino,
				    dp->nfs_type, dp->nfs_namlen, dp->nfs_name)) {
					break;
				}
				cpos += dp->nfs_reclen;
			}
			n = 0;
			if (error == 0)
				uio->uio_offset = old_off + cpos - bp->b_data - on;
		}
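		/*
		 * The translation loop above converts the wire-format
		 * nfs_dirent records cached in the buffer into system
		 * dirent entries, copied out through the uio by
		 * vop_write_dirent() one entry at a time.
		 */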
		/*
		 * Invalidate buffer if caching is disabled, forcing a
		 * re-read from the remote later.
		 */
		if (np->n_flag & NDONTCACHE)
			bp->b_flags |= B_INVAL;
		break;
	default:
		kprintf(" nfs_bioread: type %x unexpected\n", vp->v_type);
		break;
	}
	brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);

	return (error);
}

/*
 * Userland can supply any 'seek' offset when reading a NFS directory.
 * Validate the structure so we don't panic the kernel.  Note that
 * the element name is nul terminated and the nul is not included
 * in nfs_namlen.
 */
static int
nfs_check_dirent(struct nfs_dirent *dp, int maxlen)
{
	int nfs_name_off = offsetof(struct nfs_dirent, nfs_name[0]);

	if (nfs_name_off >= maxlen)
		return (EINVAL);
	if (dp->nfs_reclen < nfs_name_off || dp->nfs_reclen > maxlen)
		return (EINVAL);
	if (nfs_name_off + dp->nfs_namlen >= dp->nfs_reclen)
		return (EINVAL);
	if (dp->nfs_reclen & 3)
		return (EINVAL);
	return (0);
}
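
/*
 * Illustrative example of the checks above (not from the original
 * source): a record claiming nfs_reclen = 24 and nfs_namlen = 5 passes
 * only if the fixed header plus the 5-byte name (and its terminating
 * nul) fits strictly inside the 4-byte-aligned record, and the record
 * itself fits inside the remaining buffer (maxlen).
 */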

/*
 * Vnode op for write using bio
 *
 * nfs_write(struct vnode *a_vp, struct uio *a_uio, int a_ioflag,
 *	     struct ucred *a_cred)
 */
int
nfs_write(struct vop_write_args *ap)
{
	struct uio *uio = ap->a_uio;
	struct thread *td = uio->uio_td;
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	int ioflag = ap->a_ioflag;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	/* remaining locals inferred from their use below */
	struct buf *bp;
	struct vattr vattr;
	daddr_t lbn;
	off_t loffset;
	int bcount, biosize;
	int haverslock = 0;
	int n, on, error = 0, iomode, must_commit;
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_td != curthread)
		panic("nfs_write proc");
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, td);
	/*
	 * Synchronously flush pending buffers if we are in synchronous
	 * mode or if we are appending.
	 */
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NLMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_flush(vp, MNT_WAIT, td, 0);
			/* error = nfs_vinvalbuf(vp, V_SAVE, 1); */
			if (error)
				return (error);
		}
	}
	/*
	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
	 * get the append lock.
	 */
restart:
	if (ioflag & IO_APPEND) {
		np->n_attrstamp = 0;
		error = VOP_GETATTR(vp, &vattr);
		if (error)
			return (error);
		uio->uio_offset = np->n_size;
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * We need to obtain the rslock if we intend to modify np->n_size
	 * in order to guarantee the append point with multiple contending
	 * writers, to guarantee that no other appenders modify n_size
	 * while we are trying to obtain a truncated buffer (i.e. to avoid
	 * accidentally truncating data written by another appender due to
	 * the race), and to ensure that the buffer is populated prior to
	 * our extending of the file.  We hold rslock through the entire
	 * operation.
	 *
	 * Note that we do not synchronize the case where someone truncates
	 * the file while we are appending to it because attempting to lock
	 * this case may deadlock other parts of the system unexpectedly.
	 */
	if ((ioflag & IO_APPEND) ||
	    uio->uio_offset + uio->uio_resid > np->n_size) {
		switch(nfs_rslock(np)) {
		case ENOLCK:
			goto restart;
			/* not reached */
		case EINTR:
		case ERESTART:
			return (EINTR);
			/* not reached */
		default:
			break;
		}
		haverslock = 1;
	}
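	/*
	 * Example of the append race rslock prevents: two appenders that
	 * both sample n_size before either extends it would compute the
	 * same write offset, and the second would silently overwrite the
	 * first.  Holding rslock serializes the sample-and-extend.
	 */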
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters
	 */
	if (td->td_proc && uio->uio_offset + uio->uio_resid >
	    td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
		if (haverslock)
			nfs_rsunlock(np);
		return (EFBIG);
	}
= vp
->v_mount
->mnt_stat
.f_iosize
;
		if ((np->n_flag & NDONTCACHE) && uio->uio_iovcnt == 1) {
			iomode = NFSV3WRITE_FILESYNC;
			error = nfs_writerpc(vp, uio, &iomode, &must_commit);
			if (must_commit)
				nfs_clearcommit(vp->v_mount);
			break;
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize-1);
		loffset = uio->uio_offset - on;
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		/*
		 * Handle direct append and file extension cases, calculate
		 * unaligned buffer size.
		 */
		if (uio->uio_offset == np->n_size && n) {
			/*
			 * Get the buffer (in its pre-append state to maintain
			 * B_CACHE if it was previously set).  Resize the
			 * nfsnode after we have locked the buffer to prevent
			 * readers from reading garbage.
			 */
			bcount = on;
			bp = nfs_getcacheblk(vp, loffset, bcount, td);

			if (bp != NULL) {
				long save;

				np->n_size = uio->uio_offset + n;
				np->n_flag |= NLMODIFIED;
				vnode_pager_setsize(vp, np->n_size);

				save = bp->b_flags & B_CACHE;
				bcount += n;
				allocbuf(bp, bcount);
				bp->b_flags |= save;
			}
		} else {
			/*
			 * Obtain the locked cache block first, and then
			 * adjust the file's size as appropriate.
			 */
			bcount = on + n;
			if (loffset + bcount < np->n_size) {
				if (loffset + biosize < np->n_size)
					bcount = biosize;
				else
					bcount = np->n_size - loffset;
			}
			bp = nfs_getcacheblk(vp, loffset, bcount, td);
			if (uio->uio_offset + n > np->n_size) {
				np->n_size = uio->uio_offset + n;
				np->n_flag |= NLMODIFIED;
				vnode_pager_setsize(vp, np->n_size);
			}
		}

		if (bp == NULL) {
			error = EINTR;
			break;
		}
		/*
		 * Issue a READ if B_CACHE is not set.  In special-append
		 * mode, B_CACHE is based on the buffer prior to the write
		 * op and is typically set, avoiding the read.  If a read
		 * is required in special append mode, the server will
		 * probably send us a short-read since we extended the file
		 * on our end, resulting in b_resid == 0 and, thusly,
		 * B_CACHE getting set.
		 *
		 * We can also avoid issuing the read if the write covers
		 * the entire buffer.  We have to make sure the buffer state
		 * is reasonable in this case since we will not be initiating
		 * I/O.  See the comments in kern/vfs_bio.c's getblk() for
		 * more information.
		 *
		 * B_CACHE may also be set due to the buffer being cached
		 * normally.
		 *
		 * When doing a UIO_NOCOPY write the buffer is not
		 * overwritten and we cannot just set B_CACHE unconditionally
		 * for full-block writes.
		 */
		if (on == 0 && n == bcount && uio->uio_segflg != UIO_NOCOPY) {
			bp->b_flags |= B_CACHE;
			bp->b_flags &= ~(B_ERROR | B_INVAL);
		}
		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_cmd = BUF_CMD_READ;
			vfs_busy_pages(vp, bp);
			error = nfs_doio(vp, &bp->b_bio2, td);
			if (error) {
				brelse(bp);
				break;
			}
		}

		np->n_flag |= NLMODIFIED;
		/*
		 * If dirtyend exceeds file size, chop it down.  This should
		 * not normally occur but there is an append race where it
		 * might occur XXX, so we log it.
		 *
		 * If the chopping creates a reverse-indexed or degenerate
		 * situation with dirtyoff/end, we 0 both of them.
		 */
		if (bp->b_dirtyend > bcount) {
			kprintf("NFS append race @%08llx:%d\n",
			    bp->b_bio2.bio_offset,
			    bp->b_dirtyend - bcount);
			bp->b_dirtyend = bcount;
		}

		if (bp->b_dirtyoff >= bp->b_dirtyend)
			bp->b_dirtyoff = bp->b_dirtyend = 0;
		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 *
		 * While it is possible to merge discontiguous writes due to
		 * our having a B_CACHE buffer ( and thus valid read data
		 * for the hole), we don't because it could lead to
		 * significant cache coherency problems with multiple clients,
		 * especially if locking is implemented later on.
		 *
		 * As an optimization we could theoretically maintain
		 * a linked list of discontinuous areas, but we would still
		 * have to commit them separately so there isn't much
		 * advantage to it except perhaps a bit of asynchronization.
		 */
		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			if (bwrite(bp) == EINTR) {
				error = EINTR;
				break;
			}
			goto again;
		}
		error = uiomove((char *)bp->b_data + on, n, uio);

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.  Since write clustering does
		 * not work for the stage 1 data write, only the stage 2
		 * commit rpc, we have to clear B_CLUSTEROK as well.
		 */
		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);

		if (error) {
			bp->b_flags |= B_ERROR;
			brelse(bp);
			break;
		}
		/*
		 * Only update dirtyoff/dirtyend if not a degenerate
		 * condition.
		 */
		if (n) {
			if (bp->b_dirtyend > 0) {
				bp->b_dirtyoff = min(on, bp->b_dirtyoff);
				bp->b_dirtyend = max((on + n), bp->b_dirtyend);
			} else {
				bp->b_dirtyoff = on;
				bp->b_dirtyend = on + n;
			}
			vfs_bio_set_validclean(bp, on, n);
		}
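		/*
		 * vfs_bio_set_validclean() above marks the just-written
		 * range valid and clean at the VM page level; the
		 * buffer-level b_dirtyoff/b_dirtyend range is what drives
		 * the eventual write rpc.
		 */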
		/*
		 * If the lease is non-cachable or IO_SYNC do bwrite().
		 *
		 * IO_INVAL appears to be unused.  The idea appears to be
		 * to turn off caching in this case.  Very odd.  XXX
		 */
		if ((np->n_flag & NDONTCACHE) || (ioflag & IO_SYNC)) {
			if (ioflag & IO_INVAL)
				bp->b_flags |= B_NOCACHE;
			error = bwrite(bp);
			if (error)
				break;
			if (np->n_flag & NDONTCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, 1);
				if (error)
					break;
			}
		} else if ((n + on) == biosize) {
			bp->b_flags |= B_ASYNC;
			/* assumption: async push as elsewhere in this file */
			nfs_asyncio(vp, &bp->b_bio2, td);
		} else {
			bdwrite(bp);
		}
	} while (uio->uio_resid > 0 && n > 0);

	if (haverslock)
		nfs_rsunlock(np);

	return (error);
}

/*
 * Get an nfs cache block.
 *
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 *
 * The caller must carefully deal with the possible B_INVAL state of
 * the buffer.  nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
 * indirectly), so synchronous reads can be issued without worrying about
 * the B_INVAL state.  We have to be a little more careful when dealing
 * with writes (see comments in nfs_write()) when extending a file past
 * its EOF.
 */
static struct buf *
nfs_getcacheblk(struct vnode *vp, off_t loffset, int size, struct thread *td)
{
	struct buf *bp;
	struct nfsmount *nmp;

	nmp = VFSTONFS(vp->v_mount);

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, loffset, size, GETBLK_PCATCH, 0);
		while (bp == NULL) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, td))
				return (NULL);
			bp = getblk(vp, loffset, size, 0, 2 * hz);
		}
	} else {
		bp = getblk(vp, loffset, size, 0, 0);
	}

	/*
	 * bio2, the 'device' layer.  Since BIOs use 64 bit byte offsets
	 * now, no translation is necessary.
	 */
	bp->b_bio2.bio_offset = loffset;

	return (bp);
}

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(struct vnode *vp, int flags, int intrflg)
{
	struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;
	thread_t td = curthread;

	if (vp->v_flag & VRECLAIMED)
		return (0);

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
1145 while (np
->n_flag
& NFLUSHINPROG
) {
1146 np
->n_flag
|= NFLUSHWANT
;
1147 error
= tsleep((caddr_t
)&np
->n_flag
, 0, "nfsvinval", slptimeo
);
1148 if (error
&& intrflg
&& nfs_sigintr(nmp
, NULL
, td
))
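	/*
	 * (The NFLUSHINPROG/NFLUSHWANT pair implements a single-flusher
	 * protocol: one thread owns the flush and waiters sleep on
	 * &np->n_flag until the owner clears the flag and wakes them.)
	 */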
	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, slpflag, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, NULL, td)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, 0, slptimeo);
	}
	np->n_flag &= ~(NLMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (error);
}

/*
 * Initiate asynchronous I/O.  Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 *
 * Note: nfs_asyncio() does not clear (B_ERROR|B_INVAL) but when the bp
 * is eventually dequeued by the async daemon, nfs_doio() *will*.
 */
int
nfs_asyncio(struct vnode *vp, struct bio *bio, struct thread *td)
{
	struct buf *bp = bio->bio_buf;
	struct nfsmount *nmp;
	int i;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error;

	/*
	 * If no async daemons then return EIO to force caller to run the rpc
	 * synchronously.
	 */
	if (nfs_numasync == 0)
		return (EIO);

	KKASSERT(vp->v_tag == VT_NFS);
	nmp = VFSTONFS(vp->v_mount);
	/*
	 * Commits are usually short and sweet so let's save some cpu and
	 * leave the async daemons for more important rpc's (such as reads
	 * and writes).
	 */
	if (bp->b_cmd == BUF_CMD_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
	    (nmp->nm_bioqiods > nfs_numasync / 2)) {
		return (EIO);
	}

again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waking iod %d for mount %p\n",
				 i, nmp));
			nfs_iodwant[i] = NULL;
			nfs_iodmount[i] = nmp;
			nmp->nm_bioqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}
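	/*
	 * (Handshake above: an idle iod sleeps on &nfs_iodwant[i];
	 * clearing its wanted slot and pointing nfs_iodmount[i] at this
	 * mount before the wakeup tells it whose bio queue to drain.)
	 */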
	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bioqiods > 0) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: %d iods are already processing mount %p\n",
				 nmp->nm_bioqiods, nmp));
			gotiod = TRUE;
		}
	}
	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.  We still want
		 * to asynchronize so we block rather than return EIO.
		 */
		while (nmp->nm_bioqlen >= 2*nfs_numasync) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bioqwant = TRUE;
			error = tsleep(&nmp->nm_bioq, slpflag,
				       "nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, td))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bioqiods == 0) {
				NFS_DPF(ASYNCIO,
					("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}
		/*
		 * The passed bio's buffer is not necessarily associated with
		 * the NFS vnode it is being written to.  Store the NFS vnode
		 * in the BIO driver info.
		 */
		bio->bio_driver_info = vp;
		TAILQ_INSERT_TAIL(&nmp->nm_bioq, bio, bio_act);
		nmp->nm_bioqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.  The BIO is normalized for DEV_BSIZE.
 *
 * NOTE! TD MIGHT BE NULL
 */
int
nfs_doio(struct vnode *vp, struct bio *bio, struct thread *td)
{
	struct buf *bp = bio->bio_buf;
	struct uio *uiop;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	KKASSERT(vp->v_tag == VT_NFS);
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_td = td;
	/*
	 * clear B_ERROR and B_INVAL state prior to initiating the I/O.  We
	 * do this here so we do not have to do it in all the code that
	 * calls us.
	 */
	bp->b_flags &= ~(B_ERROR | B_INVAL);

	KASSERT(bp->b_cmd != BUF_CMD_DONE,
		("nfs_doio: bp %p already marked done!", bp));
	if (bp->b_cmd == BUF_CMD_READ) {
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		io.iov_base = bp->b_data;
		uiop->uio_rw = UIO_READ;

		switch (vp->v_type) {
		case VREG:
			uiop->uio_offset = bio->bio_offset;
			nfsstats.read_bios++;
			error = nfs_readrpc(vp, uiop);

			if (error == 0) {
				if (uiop->uio_resid) {
					/*
					 * If we had a short read with no error, we must have
					 * hit a file hole.  We should zero-fill the remainder.
					 * This can also occur if the server hits the file EOF.
					 *
					 * Holes used to be able to occur due to pending
					 * writes, but that is not possible any longer.
					 */
					int nread = bp->b_bcount - uiop->uio_resid;
					int left = uiop->uio_resid;

					if (left > 0)
						bzero((char *)bp->b_data + nread, left);
					uiop->uio_resid = 0;
				}
			}
			if (td && td->td_proc && (vp->v_flag & VTEXT) &&
			    np->n_mtime != np->n_vattr.va_mtime.tv_sec) {
				uprintf("Process killed due to text file modification\n");
				ksignal(td->td_proc, SIGKILL);
			}
			break;
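			/*
			 * (The VTEXT check above is the traditional text-file
			 * protection applied to NFS: if a running program's
			 * backing file changed on the server, the process is
			 * killed rather than left running a mix of old and
			 * new pages.)
			 */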
		case VLNK:
			uiop->uio_offset = 0;
			nfsstats.readlink_bios++;
			error = nfs_readlinkrpc(vp, uiop);
			break;
		case VDIR:
			nfsstats.readdir_bios++;
			uiop->uio_offset = bio->bio_offset;
			if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
				error = nfs_readdirplusrpc(vp, uiop);
				if (error == NFSERR_NOTSUPP)
					nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
			}
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
				error = nfs_readdirrpc(vp, uiop);
			/*
			 * end-of-directory sets B_INVAL but does not generate an
			 * EOF.
			 */
			if (error == 0 && uiop->uio_resid == bp->b_bcount)
				bp->b_flags |= B_INVAL;
			break;
		default:
			kprintf("nfs_doio: type %x unexpected\n", vp->v_type);
			break;
		}
		if (error) {
			bp->b_flags |= B_ERROR;
			bp->b_error = error;
		}
	} else {
		/*
		 * If we only need to commit, try to commit.
		 */
		KKASSERT(bp->b_cmd == BUF_CMD_WRITE);
		if (bp->b_flags & B_NEEDCOMMIT) {
			int retv;
			off_t off;

			off = bio->bio_offset + bp->b_dirtyoff;
			retv = nfs_commit(vp, off,
					  bp->b_dirtyend - bp->b_dirtyoff, td);
			if (retv == 0) {
				bp->b_dirtyoff = bp->b_dirtyend = 0;
				bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
				bp->b_resid = 0;
				biodone(bio);
				return (0);
			}
			if (retv == NFSERR_STALEWRITEVERF) {
				nfs_clearcommit(vp->v_mount);
			}
		}
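		/*
		 * (NFSERR_STALEWRITEVERF above means the server rebooted and
		 * the write verifier changed: uncommitted data may have been
		 * lost, so nfs_clearcommit() strips B_NEEDCOMMIT from cached
		 * buffers and the data gets written again rather than merely
		 * committed.)
		 */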
		/*
		 * Setup for actual write
		 */
		if (bio->bio_offset + bp->b_dirtyend > np->n_size)
			bp->b_dirtyend = np->n_size - bio->bio_offset;
		if (bp->b_dirtyend > bp->b_dirtyoff) {
			io.iov_len = uiop->uio_resid = bp->b_dirtyend
			    - bp->b_dirtyoff;
			uiop->uio_offset = bio->bio_offset + bp->b_dirtyoff;
			io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
			uiop->uio_rw = UIO_WRITE;
			nfsstats.write_bios++;

			if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
				iomode = NFSV3WRITE_UNSTABLE;
			else
				iomode = NFSV3WRITE_FILESYNC;

			error = nfs_writerpc(vp, uiop, &iomode, &must_commit);
			/*
			 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
			 * to cluster the buffers needing commit.  This will allow
			 * the system to submit a single commit rpc for the whole
			 * cluster.  We can do this even if the buffer is not 100%
			 * dirty (relative to the NFS blocksize), so we optimize the
			 * append-to-file-case.
			 *
			 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
			 * cleared because write clustering only works for commit
			 * rpc's, not for the data portion of the write).
			 */
			if (!error && iomode == NFSV3WRITE_UNSTABLE) {
				bp->b_flags |= B_NEEDCOMMIT;
				if (bp->b_dirtyoff == 0 &&
				    bp->b_dirtyend == bp->b_bcount)
					bp->b_flags |= B_CLUSTEROK;
			} else {
				bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
			}
			/*
			 * For an interrupted write, the buffer is still valid
			 * and the write hasn't been pushed to the server yet,
			 * so we can't set B_ERROR and report the interruption
			 * by setting B_EINTR.  For the B_ASYNC case, B_EINTR
			 * is not relevant, so the rpc attempt is essentially
			 * a noop.  For the case of a V3 write rpc not being
			 * committed to stable storage, the block is still
			 * dirty and requires either a commit rpc or another
			 * write rpc with iomode == NFSV3WRITE_FILESYNC before
			 * the block is reused.  This is indicated by setting
			 * the B_DELWRI and B_NEEDCOMMIT flags.
			 *
			 * If the buffer is marked B_PAGING, it does not reside on
			 * the vp's paging queues so we cannot call bdirty().  The
			 * bp in this case is not an NFS cache block so we should
			 * be safe. XXX
			 */
			if (error == EINTR
			    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
				crit_enter();
				bp->b_flags &= ~(B_INVAL|B_NOCACHE);
				if ((bp->b_flags & B_PAGING) == 0)
					bdirty(bp);
				if (error && (bp->b_flags & B_ASYNC) == 0)
					bp->b_flags |= B_EINTR;
				crit_exit();
			} else {
				if (error) {
					bp->b_flags |= B_ERROR;
					bp->b_error = np->n_error = error;
					np->n_flag |= NWRITEERR;
				}
				bp->b_dirtyoff = bp->b_dirtyend = 0;
			}
		} else {
			bp->b_resid = 0;
			biodone(bio);
			return (0);
		}
	}
->b_resid
= uiop
->uio_resid
;
1520 nfs_clearcommit(vp
->v_mount
);

/*
 * Used to aid in handling ftruncate() operations on the NFS client side.
 * Truncation creates a number of special problems for NFS.  We have to
 * throw away VM pages and buffer cache buffers that are beyond EOF, and
 * we have to properly handle VM pages or (potentially dirty) buffers
 * that straddle the truncation point.
 */
int
nfs_meta_setsize(struct vnode *vp, struct thread *td, u_quad_t nsize)
{
	struct nfsnode *np = VTONFS(vp);
	u_quad_t tsize = np->n_size;
	int biosize = vp->v_mount->mnt_stat.f_iosize;
	int error = 0;

	np->n_size = nsize;

	if (np->n_size < tsize) {
		struct buf *bp;
		daddr_t lbn;
		off_t loffset;
		int bufsize;

		/*
		 * vtruncbuf() doesn't get the buffer overlapping the
		 * truncation point.  We may have a B_DELWRI and/or B_CACHE
		 * buffer that now needs to be truncated.
		 */
		error = vtruncbuf(vp, nsize, biosize);
		lbn = nsize / biosize;
		bufsize = nsize & (biosize - 1);
		loffset = nsize - bufsize;
		bp = nfs_getcacheblk(vp, loffset, bufsize, td);
		if (bp->b_dirtyoff > bp->b_bcount)
			bp->b_dirtyoff = bp->b_bcount;
		if (bp->b_dirtyend > bp->b_bcount)
			bp->b_dirtyend = bp->b_bcount;
		bp->b_flags |= B_RELBUF;	/* don't leave garbage around */
		brelse(bp);
	} else {
		vnode_pager_setsize(vp, nsize);
	}
	return (error);
}
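
/*
 * (Note: when the file grows, only vnode_pager_setsize() is needed; the
 * straddling-buffer cleanup above applies to the shrinking case, where
 * the buffer overlapping the new EOF may still carry stale dirty state.)
 */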