2 * Copyright (c) 1989, 1993
3 * The Regents of the University of California. All rights reserved.
5 * This code is derived from software contributed to Berkeley by
6 * Rick Macklem at The University of Guelph.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 * must display the following acknowledgement:
18 * This product includes software developed by the University of
19 * California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
37 * $FreeBSD: /repoman/r/ncvs/src/sys/nfsclient/nfs_bio.c,v 1.130 2004/04/14 23:23:55 peadar Exp $
38 * $DragonFly: src/sys/vfs/nfs/nfs_bio.c,v 1.45 2008/07/18 00:09:39 dillon Exp $
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/resourcevar.h>
45 #include <sys/signalvar.h>
48 #include <sys/vnode.h>
49 #include <sys/mount.h>
50 #include <sys/kernel.h>
52 #include <sys/msfbuf.h>
55 #include <vm/vm_extern.h>
56 #include <vm/vm_page.h>
57 #include <vm/vm_object.h>
58 #include <vm/vm_pager.h>
59 #include <vm/vnode_pager.h>
62 #include <sys/thread2.h>
70 #include "nfsm_subs.h"
73 static struct buf
*nfs_getcacheblk(struct vnode
*vp
, off_t loffset
,
74 int size
, struct thread
*td
);
75 static int nfs_check_dirent(struct nfs_dirent
*dp
, int maxlen
);
76 static void nfsiodone_sync(struct bio
*bio
);
77 static void nfs_readrpc_bio_done(nfsm_info_t info
);
78 static void nfs_writerpc_bio_done(nfsm_info_t info
);
79 static void nfs_commitrpc_bio_done(nfsm_info_t info
);
82 * Vnode op for VM getpages.
84 * nfs_getpages(struct vnode *a_vp, vm_page_t *a_m, int a_count,
85 * int a_reqpage, vm_ooffset_t a_offset)
88 nfs_getpages(struct vop_getpages_args
*ap
)
90 struct thread
*td
= curthread
; /* XXX */
91 int i
, error
, nextoff
, size
, toff
, count
, npages
;
102 nmp
= VFSTONFS(vp
->v_mount
);
106 if (vp
->v_object
== NULL
) {
107 kprintf("nfs_getpages: called with non-merged cache vnode??\n");
108 return VM_PAGER_ERROR
;
111 if ((nmp
->nm_flag
& NFSMNT_NFSV3
) != 0 &&
112 (nmp
->nm_state
& NFSSTA_GOTFSINFO
) == 0)
113 (void)nfs_fsinfo(nmp
, vp
, td
);
115 npages
= btoc(count
);
118 * NOTE that partially valid pages may occur in cases other
119 * than file EOF, such as when a file is partially written and
120 * ftruncate()-extended to a larger size. It is also possible
121 * for the valid bits to be set on garbage beyond the file EOF and
122 * clear in the area before EOF (e.g. m->valid == 0xfc), which can
123 * occur due to vtruncbuf() and the buffer cache's handling of
124 * pages which 'straddle' buffers or when b_bufsize is not a
125 * multiple of PAGE_SIZE.... the buffer cache cannot normally
126 * clear the extra bits. This kind of situation occurs when you
127 * make a small write() (m->valid == 0x03) and then mmap() and
128 * fault in the buffer(m->valid = 0xFF). When NFS flushes the
129 * buffer (vinvalbuf() m->valid = 0xFC) we are left with a mess.
131 * This is combined with the possibility that the pages are partially
132 * dirty or that there is a buffer backing the pages that is dirty
133 * (even if m->dirty is 0).
135 * To solve this problem several hacks have been made: (1) NFS
136 * guarantees that the IO block size is a multiple of PAGE_SIZE and
137 * (2) The buffer cache, when invalidating an NFS buffer, will
138 * disregard the buffer's fragmentary b_bufsize and invalidate
139 * the whole page rather than just the piece the buffer owns.
141 * This allows us to assume that a partially valid page found here
142 * is fully valid (vm_fault will zero out areas of the page not
145 m
= pages
[ap
->a_reqpage
];
147 for (i
= 0; i
< npages
; ++i
) {
148 if (i
!= ap
->a_reqpage
)
149 vnode_pager_freepage(pages
[i
]);
155 * Use an MSF_BUF as a medium to retrieve data from the pages.
157 msf_map_pagelist(&msf
, pages
, npages
, 0);
159 kva
= msf_buf_kva(msf
);
165 uio
.uio_offset
= IDX_TO_OFF(pages
[0]->pindex
);
166 uio
.uio_resid
= count
;
167 uio
.uio_segflg
= UIO_SYSSPACE
;
168 uio
.uio_rw
= UIO_READ
;
171 error
= nfs_readrpc_uio(vp
, &uio
);
174 if (error
&& ((int)uio
.uio_resid
== count
)) {
175 kprintf("nfs_getpages: error %d\n", error
);
176 for (i
= 0; i
< npages
; ++i
) {
177 if (i
!= ap
->a_reqpage
)
178 vnode_pager_freepage(pages
[i
]);
180 return VM_PAGER_ERROR
;
184 * Calculate the number of bytes read and validate only that number
185 * of bytes. Note that due to pending writes, size may be 0. This
186 * does not mean that the remaining data is invalid!
189 size
= count
- (int)uio
.uio_resid
;
191 for (i
= 0, toff
= 0; i
< npages
; i
++, toff
= nextoff
) {
192 nextoff
= toff
+ PAGE_SIZE
;
195 m
->flags
&= ~PG_ZERO
;
197 if (nextoff
<= size
) {
199 * Read operation filled an entire page
201 m
->valid
= VM_PAGE_BITS_ALL
;
203 } else if (size
> toff
) {
205 * Read operation filled a partial page.
208 vm_page_set_validclean(m
, 0, size
- toff
);
209 /* handled by vm_fault now */
210 /* vm_page_zero_invalid(m, TRUE); */
213 * Read operation was short. If no error occurred
214 * we may have hit a zero-fill section. We simply
215 * leave valid set to 0.
219 if (i
!= ap
->a_reqpage
) {
221 * Whether or not to leave the page activated is up in
222 * the air, but we should put the page on a page queue
223 * somewhere (it already is in the object). Result:
224 * It appears that empirical results show that
225 * deactivating pages is best.
229 * Just in case someone was asking for this page we
230 * now tell them that it is ok to use.
233 if (m
->flags
& PG_WANTED
)
236 vm_page_deactivate(m
);
239 vnode_pager_freepage(m
);
247 * Vnode op for VM putpages.
249 * nfs_putpages(struct vnode *a_vp, vm_page_t *a_m, int a_count, int a_sync,
250 * int *a_rtvals, vm_ooffset_t a_offset)
253 nfs_putpages(struct vop_putpages_args
*ap
)
255 struct thread
*td
= curthread
;
259 int iomode
, must_commit
, i
, error
, npages
, count
;
263 struct nfsmount
*nmp
;
270 nmp
= VFSTONFS(vp
->v_mount
);
273 rtvals
= ap
->a_rtvals
;
274 npages
= btoc(count
);
275 offset
= IDX_TO_OFF(pages
[0]->pindex
);
277 if ((nmp
->nm_flag
& NFSMNT_NFSV3
) != 0 &&
278 (nmp
->nm_state
& NFSSTA_GOTFSINFO
) == 0)
279 (void)nfs_fsinfo(nmp
, vp
, td
);
281 for (i
= 0; i
< npages
; i
++) {
282 rtvals
[i
] = VM_PAGER_AGAIN
;
286 * When putting pages, do not extend file past EOF.
289 if (offset
+ count
> np
->n_size
) {
290 count
= np
->n_size
- offset
;
296 * Use an MSF_BUF as a medium to retrieve data from the pages.
298 msf_map_pagelist(&msf
, pages
, npages
, 0);
300 kva
= msf_buf_kva(msf
);
306 uio
.uio_offset
= offset
;
307 uio
.uio_resid
= (size_t)count
;
308 uio
.uio_segflg
= UIO_SYSSPACE
;
309 uio
.uio_rw
= UIO_WRITE
;
312 if ((ap
->a_sync
& VM_PAGER_PUT_SYNC
) == 0)
313 iomode
= NFSV3WRITE_UNSTABLE
;
315 iomode
= NFSV3WRITE_FILESYNC
;
317 error
= nfs_writerpc_uio(vp
, &uio
, &iomode
, &must_commit
);
322 int nwritten
= round_page(count
- (int)uio
.uio_resid
) / PAGE_SIZE
;
323 for (i
= 0; i
< nwritten
; i
++) {
324 rtvals
[i
] = VM_PAGER_OK
;
325 vm_page_undirty(pages
[i
]);
328 nfs_clearcommit(vp
->v_mount
);
334 * Vnode op for read using bio
337 nfs_bioread(struct vnode
*vp
, struct uio
*uio
, int ioflag
)
339 struct nfsnode
*np
= VTONFS(vp
);
341 struct buf
*bp
= 0, *rabp
;
344 struct nfsmount
*nmp
= VFSTONFS(vp
->v_mount
);
350 int nra
, error
= 0, n
= 0, on
= 0;
353 if (uio
->uio_rw
!= UIO_READ
)
354 panic("nfs_read mode");
356 if (uio
->uio_resid
== 0)
358 if (uio
->uio_offset
< 0) /* XXX VDIR cookies can be negative */
362 if ((nmp
->nm_flag
& NFSMNT_NFSV3
) != 0 &&
363 (nmp
->nm_state
& NFSSTA_GOTFSINFO
) == 0)
364 (void)nfs_fsinfo(nmp
, vp
, td
);
365 if (vp
->v_type
!= VDIR
&&
366 (uio
->uio_offset
+ uio
->uio_resid
) > nmp
->nm_maxfilesize
)
368 biosize
= vp
->v_mount
->mnt_stat
.f_iosize
;
369 seqcount
= (int)((off_t
)(ioflag
>> IO_SEQSHIFT
) * biosize
/ BKVASIZE
);
372 * For nfs, cache consistency can only be maintained approximately.
373 * Although RFC1094 does not specify the criteria, the following is
374 * believed to be compatible with the reference port.
376 * NFS: If local changes have been made and this is a
377 * directory, the directory must be invalidated and
378 * the attribute cache must be cleared.
380 * GETATTR is called to synchronize the file size.
382 * If remote changes are detected local data is flushed
383 * and the cache is invalidated.
385 * NOTE: In the normal case the attribute cache is not
386 * cleared which means GETATTR may use cached data and
387 * not immediately detect changes made on the server.
389 if ((np
->n_flag
& NLMODIFIED
) && vp
->v_type
== VDIR
) {
391 error
= nfs_vinvalbuf(vp
, V_SAVE
, 1);
396 error
= VOP_GETATTR(vp
, &vattr
);
399 if (np
->n_flag
& NRMODIFIED
) {
400 if (vp
->v_type
== VDIR
)
402 error
= nfs_vinvalbuf(vp
, V_SAVE
, 1);
405 np
->n_flag
&= ~NRMODIFIED
;
408 if (np
->n_flag
& NDONTCACHE
) {
409 switch (vp
->v_type
) {
411 return (nfs_readrpc_uio(vp
, uio
));
413 return (nfs_readlinkrpc_uio(vp
, uio
));
417 kprintf(" NDONTCACHE: type %x unexpected\n", vp
->v_type
);
421 switch (vp
->v_type
) {
423 nfsstats
.biocache_reads
++;
424 lbn
= uio
->uio_offset
/ biosize
;
425 on
= uio
->uio_offset
& (biosize
- 1);
426 loffset
= (off_t
)lbn
* biosize
;
429 * Start the read ahead(s), as required.
431 if (nmp
->nm_readahead
> 0 && nfs_asyncok(nmp
)) {
432 for (nra
= 0; nra
< nmp
->nm_readahead
&& nra
< seqcount
&&
433 (off_t
)(lbn
+ 1 + nra
) * biosize
< np
->n_size
; nra
++) {
434 rabn
= lbn
+ 1 + nra
;
435 raoffset
= (off_t
)rabn
* biosize
;
436 if (findblk(vp
, raoffset
, FINDBLK_TEST
) == NULL
) {
437 rabp
= nfs_getcacheblk(vp
, raoffset
, biosize
, td
);
440 if ((rabp
->b_flags
& (B_CACHE
|B_DELWRI
)) == 0) {
441 rabp
->b_cmd
= BUF_CMD_READ
;
442 vfs_busy_pages(vp
, rabp
);
443 nfs_asyncio(vp
, &rabp
->b_bio2
);
452 * Obtain the buffer cache block. Figure out the buffer size
453 * when we are at EOF. If we are modifying the size of the
454 * buffer based on an EOF condition we need to hold
455 * nfs_rslock() through obtaining the buffer to prevent
456 * a potential writer-appender from messing with n_size.
457 * Otherwise we may accidentally truncate the buffer and
460 * Note that bcount is *not* DEV_BSIZE aligned.
465 if (loffset
>= np
->n_size
) {
467 } else if (loffset
+ biosize
> np
->n_size
) {
468 bcount
= np
->n_size
- loffset
;
470 if (bcount
!= biosize
) {
471 switch(nfs_rslock(np
)) {
484 bp
= nfs_getcacheblk(vp
, loffset
, bcount
, td
);
486 if (bcount
!= biosize
)
492 * If B_CACHE is not set, we must issue the read. If this
493 * fails, we return an error.
496 if ((bp
->b_flags
& B_CACHE
) == 0) {
497 bp
->b_cmd
= BUF_CMD_READ
;
498 bp
->b_bio2
.bio_done
= nfsiodone_sync
;
499 bp
->b_bio2
.bio_flags
|= BIO_SYNC
;
500 vfs_busy_pages(vp
, bp
);
501 error
= nfs_doio(vp
, &bp
->b_bio2
, td
);
509 * on is the offset into the current bp. Figure out how many
510 * bytes we can copy out of the bp. Note that bcount is
511 * NOT DEV_BSIZE aligned.
513 * Then figure out how many bytes we can copy into the uio.
518 n
= (int)szmin((unsigned)(bcount
- on
), uio
->uio_resid
);
521 biosize
= min(NFS_MAXPATHLEN
, np
->n_size
);
522 nfsstats
.biocache_readlinks
++;
523 bp
= nfs_getcacheblk(vp
, (off_t
)0, biosize
, td
);
526 if ((bp
->b_flags
& B_CACHE
) == 0) {
527 bp
->b_cmd
= BUF_CMD_READ
;
528 bp
->b_bio2
.bio_done
= nfsiodone_sync
;
529 bp
->b_bio2
.bio_flags
|= BIO_SYNC
;
530 vfs_busy_pages(vp
, bp
);
531 error
= nfs_doio(vp
, &bp
->b_bio2
, td
);
533 bp
->b_flags
|= B_ERROR
| B_INVAL
;
538 n
= (int)szmin(uio
->uio_resid
, bp
->b_bcount
- bp
->b_resid
);
542 nfsstats
.biocache_readdirs
++;
543 if (np
->n_direofoffset
544 && uio
->uio_offset
>= np
->n_direofoffset
) {
547 lbn
= (uoff_t
)uio
->uio_offset
/ NFS_DIRBLKSIZ
;
548 on
= uio
->uio_offset
& (NFS_DIRBLKSIZ
- 1);
549 loffset
= uio
->uio_offset
- on
;
550 bp
= nfs_getcacheblk(vp
, loffset
, NFS_DIRBLKSIZ
, td
);
554 if ((bp
->b_flags
& B_CACHE
) == 0) {
555 bp
->b_cmd
= BUF_CMD_READ
;
556 bp
->b_bio2
.bio_done
= nfsiodone_sync
;
557 bp
->b_bio2
.bio_flags
|= BIO_SYNC
;
558 vfs_busy_pages(vp
, bp
);
559 error
= nfs_doio(vp
, &bp
->b_bio2
, td
);
562 while (error
== NFSERR_BAD_COOKIE
) {
563 kprintf("got bad cookie vp %p bp %p\n", vp
, bp
);
565 error
= nfs_vinvalbuf(vp
, 0, 1);
567 * Yuck! The directory has been modified on the
568 * server. The only way to get the block is by
569 * reading from the beginning to get all the
572 * Leave the last bp intact unless there is an error.
573 * Loop back up to the while if the error is another
574 * NFSERR_BAD_COOKIE (double yuch!).
576 for (i
= 0; i
<= lbn
&& !error
; i
++) {
577 if (np
->n_direofoffset
578 && (i
* NFS_DIRBLKSIZ
) >= np
->n_direofoffset
)
580 bp
= nfs_getcacheblk(vp
, (off_t
)i
* NFS_DIRBLKSIZ
,
584 if ((bp
->b_flags
& B_CACHE
) == 0) {
585 bp
->b_cmd
= BUF_CMD_READ
;
586 bp
->b_bio2
.bio_done
= nfsiodone_sync
;
587 bp
->b_bio2
.bio_flags
|= BIO_SYNC
;
588 vfs_busy_pages(vp
, bp
);
589 error
= nfs_doio(vp
, &bp
->b_bio2
, td
);
591 * no error + B_INVAL == directory EOF,
594 if (error
== 0 && (bp
->b_flags
& B_INVAL
))
598 * An error will throw away the block and the
599 * for loop will break out. If no error and this
600 * is not the block we want, we throw away the
601 * block and go for the next one via the for loop.
603 if (error
|| i
< lbn
)
608 * The above while is repeated if we hit another cookie
609 * error. If we hit an error and it wasn't a cookie error,
617 * If not eof and read aheads are enabled, start one.
618 * (You need the current block first, so that you have the
619 * directory offset cookie of the next block.)
621 if (nmp
->nm_readahead
> 0 && nfs_asyncok(nmp
) &&
622 (bp
->b_flags
& B_INVAL
) == 0 &&
623 (np
->n_direofoffset
== 0 ||
624 loffset
+ NFS_DIRBLKSIZ
< np
->n_direofoffset
) &&
625 (np
->n_flag
& NDONTCACHE
) == 0 &&
626 findblk(vp
, loffset
+ NFS_DIRBLKSIZ
, FINDBLK_TEST
) == NULL
628 rabp
= nfs_getcacheblk(vp
, loffset
+ NFS_DIRBLKSIZ
,
631 if ((rabp
->b_flags
& (B_CACHE
|B_DELWRI
)) == 0) {
632 rabp
->b_cmd
= BUF_CMD_READ
;
633 vfs_busy_pages(vp
, rabp
);
634 nfs_asyncio(vp
, &rabp
->b_bio2
);
641 * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
642 * chopped for the EOF condition, we cannot tell how large
643 * NFS directories are going to be until we hit EOF. So
644 * an NFS directory buffer is *not* chopped to its EOF. Now,
645 * it just so happens that b_resid will effectively chop it
646 * to EOF. *BUT* this information is lost if the buffer goes
647 * away and is reconstituted into a B_CACHE state ( due to
648 * being VMIO ) later. So we keep track of the directory eof
649 * in np->n_direofoffset and chop it off as an extra step
652 n
= (int)szmin(uio
->uio_resid
,
653 NFS_DIRBLKSIZ
- bp
->b_resid
- on
);
654 if (np
->n_direofoffset
&& n
> np
->n_direofoffset
- uio
->uio_offset
)
655 n
= np
->n_direofoffset
- uio
->uio_offset
;
658 kprintf(" nfs_bioread: type %x unexpected\n",vp
->v_type
);
662 switch (vp
->v_type
) {
665 error
= uiomove(bp
->b_data
+ on
, (int)n
, uio
);
669 error
= uiomove(bp
->b_data
+ on
, (int)n
, uio
);
674 off_t old_off
= uio
->uio_offset
;
676 struct nfs_dirent
*dp
;
679 * We are casting cpos to nfs_dirent, it must be
687 cpos
= bp
->b_data
+ on
;
688 epos
= bp
->b_data
+ on
+ n
;
689 while (cpos
< epos
&& error
== 0 && uio
->uio_resid
> 0) {
690 dp
= (struct nfs_dirent
*)cpos
;
691 error
= nfs_check_dirent(dp
, (int)(epos
- cpos
));
694 if (vop_write_dirent(&error
, uio
, dp
->nfs_ino
,
695 dp
->nfs_type
, dp
->nfs_namlen
, dp
->nfs_name
)) {
698 cpos
+= dp
->nfs_reclen
;
702 uio
->uio_offset
= old_off
+ cpos
- bp
->b_data
- on
;
705 * Invalidate buffer if caching is disabled, forcing a
706 * re-read from the remote later.
708 if (np
->n_flag
& NDONTCACHE
)
709 bp
->b_flags
|= B_INVAL
;
712 kprintf(" nfs_bioread: type %x unexpected\n",vp
->v_type
);
715 } while (error
== 0 && uio
->uio_resid
> 0 && n
> 0);
720 * Userland can supply any 'seek' offset when reading a NFS directory.
721 * Validate the structure so we don't panic the kernel. Note that
722 * the element name is nul terminated and the nul is not included
727 nfs_check_dirent(struct nfs_dirent
*dp
, int maxlen
)
729 int nfs_name_off
= offsetof(struct nfs_dirent
, nfs_name
[0]);
731 if (nfs_name_off
>= maxlen
)
733 if (dp
->nfs_reclen
< nfs_name_off
|| dp
->nfs_reclen
> maxlen
)
735 if (nfs_name_off
+ dp
->nfs_namlen
>= dp
->nfs_reclen
)
737 if (dp
->nfs_reclen
& 3)
743 * Vnode op for write using bio
745 * nfs_write(struct vnode *a_vp, struct uio *a_uio, int a_ioflag,
746 * struct ucred *a_cred)
749 nfs_write(struct vop_write_args
*ap
)
751 struct uio
*uio
= ap
->a_uio
;
752 struct thread
*td
= uio
->uio_td
;
753 struct vnode
*vp
= ap
->a_vp
;
754 struct nfsnode
*np
= VTONFS(vp
);
755 int ioflag
= ap
->a_ioflag
;
758 struct nfsmount
*nmp
= VFSTONFS(vp
->v_mount
);
761 int n
, on
, error
= 0, iomode
, must_commit
;
767 if (uio
->uio_rw
!= UIO_WRITE
)
768 panic("nfs_write mode");
769 if (uio
->uio_segflg
== UIO_USERSPACE
&& uio
->uio_td
!= curthread
)
770 panic("nfs_write proc");
772 if (vp
->v_type
!= VREG
)
774 if (np
->n_flag
& NWRITEERR
) {
775 np
->n_flag
&= ~NWRITEERR
;
776 return (np
->n_error
);
778 if ((nmp
->nm_flag
& NFSMNT_NFSV3
) != 0 &&
779 (nmp
->nm_state
& NFSSTA_GOTFSINFO
) == 0)
780 (void)nfs_fsinfo(nmp
, vp
, td
);
783 * Synchronously flush pending buffers if we are in synchronous
784 * mode or if we are appending.
786 if (ioflag
& (IO_APPEND
| IO_SYNC
)) {
787 if (np
->n_flag
& NLMODIFIED
) {
789 error
= nfs_flush(vp
, MNT_WAIT
, td
, 0);
790 /* error = nfs_vinvalbuf(vp, V_SAVE, 1); */
797 * If IO_APPEND then load uio_offset. We restart here if we cannot
798 * get the append lock.
801 if (ioflag
& IO_APPEND
) {
803 error
= VOP_GETATTR(vp
, &vattr
);
806 uio
->uio_offset
= np
->n_size
;
809 if (uio
->uio_offset
< 0)
811 if ((uio
->uio_offset
+ uio
->uio_resid
) > nmp
->nm_maxfilesize
)
813 if (uio
->uio_resid
== 0)
817 * We need to obtain the rslock if we intend to modify np->n_size
818 * in order to guarantee the append point with multiple contending
819 * writers, to guarantee that no other appenders modify n_size
820 * while we are trying to obtain a truncated buffer (i.e. to avoid
821 * accidentally truncating data written by another appender due to
822 * the race), and to ensure that the buffer is populated prior to
823 * our extending of the file. We hold rslock through the entire
826 * Note that we do not synchronize the case where someone truncates
827 * the file while we are appending to it because attempting to lock
828 * this case may deadlock other parts of the system unexpectedly.
830 if ((ioflag
& IO_APPEND
) ||
831 uio
->uio_offset
+ uio
->uio_resid
> np
->n_size
) {
832 switch(nfs_rslock(np
)) {
847 * Maybe this should be above the vnode op call, but so long as
848 * file servers have no limits, I don't think it matters
850 if (td
->td_proc
&& uio
->uio_offset
+ uio
->uio_resid
>
851 td
->td_proc
->p_rlimit
[RLIMIT_FSIZE
].rlim_cur
) {
852 lwpsignal(td
->td_proc
, td
->td_lwp
, SIGXFSZ
);
858 biosize
= vp
->v_mount
->mnt_stat
.f_iosize
;
861 if ((np
->n_flag
& NDONTCACHE
) && uio
->uio_iovcnt
== 1) {
862 iomode
= NFSV3WRITE_FILESYNC
;
863 error
= nfs_writerpc_uio(vp
, uio
, &iomode
, &must_commit
);
865 nfs_clearcommit(vp
->v_mount
);
868 nfsstats
.biocache_writes
++;
869 lbn
= uio
->uio_offset
/ biosize
;
870 on
= uio
->uio_offset
& (biosize
-1);
871 loffset
= uio
->uio_offset
- on
;
872 n
= (int)szmin((unsigned)(biosize
- on
), uio
->uio_resid
);
875 * Handle direct append and file extension cases, calculate
876 * unaligned buffer size.
879 if (uio
->uio_offset
== np
->n_size
&& n
) {
881 * Get the buffer (in its pre-append state to maintain
882 * B_CACHE if it was previously set). Resize the
883 * nfsnode after we have locked the buffer to prevent
884 * readers from reading garbage.
887 bp
= nfs_getcacheblk(vp
, loffset
, bcount
, td
);
892 np
->n_size
= uio
->uio_offset
+ n
;
893 np
->n_flag
|= NLMODIFIED
;
894 vnode_pager_setsize(vp
, np
->n_size
);
896 save
= bp
->b_flags
& B_CACHE
;
898 allocbuf(bp
, bcount
);
903 * Obtain the locked cache block first, and then
904 * adjust the file's size as appropriate.
907 if (loffset
+ bcount
< np
->n_size
) {
908 if (loffset
+ biosize
< np
->n_size
)
911 bcount
= np
->n_size
- loffset
;
913 bp
= nfs_getcacheblk(vp
, loffset
, bcount
, td
);
914 if (uio
->uio_offset
+ n
> np
->n_size
) {
915 np
->n_size
= uio
->uio_offset
+ n
;
916 np
->n_flag
|= NLMODIFIED
;
917 vnode_pager_setsize(vp
, np
->n_size
);
927 * Issue a READ if B_CACHE is not set. In special-append
928 * mode, B_CACHE is based on the buffer prior to the write
929 * op and is typically set, avoiding the read. If a read
930 * is required in special append mode, the server will
931 * probably send us a short-read since we extended the file
932 * on our end, resulting in b_resid == 0 and, thusly,
933 * B_CACHE getting set.
935 * We can also avoid issuing the read if the write covers
936 * the entire buffer. We have to make sure the buffer state
937 * is reasonable in this case since we will not be initiating
938 * I/O. See the comments in kern/vfs_bio.c's getblk() for
941 * B_CACHE may also be set due to the buffer being cached
944 * When doing a UIO_NOCOPY write the buffer is not
945 * overwritten and we cannot just set B_CACHE unconditionally
946 * for full-block writes.
949 if (on
== 0 && n
== bcount
&& uio
->uio_segflg
!= UIO_NOCOPY
) {
950 bp
->b_flags
|= B_CACHE
;
951 bp
->b_flags
&= ~(B_ERROR
| B_INVAL
);
954 if ((bp
->b_flags
& B_CACHE
) == 0) {
955 bp
->b_cmd
= BUF_CMD_READ
;
956 bp
->b_bio2
.bio_done
= nfsiodone_sync
;
957 bp
->b_bio2
.bio_flags
|= BIO_SYNC
;
958 vfs_busy_pages(vp
, bp
);
959 error
= nfs_doio(vp
, &bp
->b_bio2
, td
);
969 np
->n_flag
|= NLMODIFIED
;
972 * If dirtyend exceeds file size, chop it down. This should
973 * not normally occur but there is an append race where it
974 * might occur XXX, so we log it.
976 * If the chopping creates a reverse-indexed or degenerate
977 * situation with dirtyoff/end, we 0 both of them.
980 if (bp
->b_dirtyend
> bcount
) {
981 kprintf("NFS append race @%08llx:%d\n",
982 (long long)bp
->b_bio2
.bio_offset
,
983 bp
->b_dirtyend
- bcount
);
984 bp
->b_dirtyend
= bcount
;
987 if (bp
->b_dirtyoff
>= bp
->b_dirtyend
)
988 bp
->b_dirtyoff
= bp
->b_dirtyend
= 0;
991 * If the new write will leave a contiguous dirty
992 * area, just update the b_dirtyoff and b_dirtyend,
993 * otherwise force a write rpc of the old dirty area.
995 * While it is possible to merge discontiguous writes due to
996 * our having a B_CACHE buffer ( and thus valid read data
997 * for the hole), we don't because it could lead to
998 * significant cache coherency problems with multiple clients,
999 * especially if locking is implemented later on.
1001 * as an optimization we could theoretically maintain
1002 * a linked list of discontinuous areas, but we would still
1003 * have to commit them separately so there isn't much
1004 * advantage to it except perhaps a bit of asynchronization.
1007 if (bp
->b_dirtyend
> 0 &&
1008 (on
> bp
->b_dirtyend
|| (on
+ n
) < bp
->b_dirtyoff
)) {
1009 if (bwrite(bp
) == EINTR
) {
1016 error
= uiomove((char *)bp
->b_data
+ on
, n
, uio
);
1019 * Since this block is being modified, it must be written
1020 * again and not just committed. Since write clustering does
1021 * not work for the stage 1 data write, only the stage 2
1022 * commit rpc, we have to clear B_CLUSTEROK as well.
1024 bp
->b_flags
&= ~(B_NEEDCOMMIT
| B_CLUSTEROK
);
1027 bp
->b_flags
|= B_ERROR
;
1033 * Only update dirtyoff/dirtyend if not a degenerate
1037 if (bp
->b_dirtyend
> 0) {
1038 bp
->b_dirtyoff
= min(on
, bp
->b_dirtyoff
);
1039 bp
->b_dirtyend
= max((on
+ n
), bp
->b_dirtyend
);
1041 bp
->b_dirtyoff
= on
;
1042 bp
->b_dirtyend
= on
+ n
;
1044 vfs_bio_set_validclean(bp
, on
, n
);
1048 * If the lease is non-cachable or IO_SYNC do bwrite().
1050 * IO_INVAL appears to be unused. The idea appears to be
1051 * to turn off caching in this case. Very odd. XXX
1053 * If nfs_async is set bawrite() will use an unstable write
1054 * (build dirty bufs on the server), so we might as well
1055 * push it out with bawrite(). If nfs_async is not set we
1056 * use bdwrite() to cache dirty bufs on the client.
1058 if ((np
->n_flag
& NDONTCACHE
) || (ioflag
& IO_SYNC
)) {
1059 if (ioflag
& IO_INVAL
)
1060 bp
->b_flags
|= B_NOCACHE
;
1064 if (np
->n_flag
& NDONTCACHE
) {
1065 error
= nfs_vinvalbuf(vp
, V_SAVE
, 1);
1069 } else if ((n
+ on
) == biosize
&& nfs_async
) {
1074 } while (uio
->uio_resid
> 0 && n
> 0);
1083 * Get an nfs cache block.
1085 * Allocate a new one if the block isn't currently in the cache
1086 * and return the block marked busy. If the calling process is
1087 * interrupted by a signal for an interruptible mount point, return
1090 * The caller must carefully deal with the possible B_INVAL state of
1091 * the buffer. nfs_startio() clears B_INVAL (and nfs_asyncio() clears it
1092 * indirectly), so synchronous reads can be issued without worrying about
1093 * the B_INVAL state. We have to be a little more careful when dealing
1094 * with writes (see comments in nfs_write()) when extending a file past
1098 nfs_getcacheblk(struct vnode
*vp
, off_t loffset
, int size
, struct thread
*td
)
1102 struct nfsmount
*nmp
;
1107 if (nmp
->nm_flag
& NFSMNT_INT
) {
1108 bp
= getblk(vp
, loffset
, size
, GETBLK_PCATCH
, 0);
1109 while (bp
== NULL
) {
1110 if (nfs_sigintr(nmp
, NULL
, td
))
1112 bp
= getblk(vp
, loffset
, size
, 0, 2 * hz
);
1115 bp
= getblk(vp
, loffset
, size
, 0, 0);
1119 * bio2, the 'device' layer. Since BIOs use 64 bit byte offsets
1120 * now, no translation is necessary.
1122 bp
->b_bio2
.bio_offset
= loffset
;
1127 * Flush and invalidate all dirty buffers. If another process is already
1128 * doing the flush, just wait for completion.
1131 nfs_vinvalbuf(struct vnode
*vp
, int flags
, int intrflg
)
1133 struct nfsnode
*np
= VTONFS(vp
);
1134 struct nfsmount
*nmp
= VFSTONFS(vp
->v_mount
);
1135 int error
= 0, slpflag
, slptimeo
;
1136 thread_t td
= curthread
;
1138 if (vp
->v_flag
& VRECLAIMED
)
1141 if ((nmp
->nm_flag
& NFSMNT_INT
) == 0)
1151 * First wait for any other process doing a flush to complete.
1153 while (np
->n_flag
& NFLUSHINPROG
) {
1154 np
->n_flag
|= NFLUSHWANT
;
1155 error
= tsleep((caddr_t
)&np
->n_flag
, 0, "nfsvinval", slptimeo
);
1156 if (error
&& intrflg
&& nfs_sigintr(nmp
, NULL
, td
))
1161 * Now, flush as required.
1163 np
->n_flag
|= NFLUSHINPROG
;
1164 error
= vinvalbuf(vp
, flags
, slpflag
, 0);
1166 if (intrflg
&& nfs_sigintr(nmp
, NULL
, td
)) {
1167 np
->n_flag
&= ~NFLUSHINPROG
;
1168 if (np
->n_flag
& NFLUSHWANT
) {
1169 np
->n_flag
&= ~NFLUSHWANT
;
1170 wakeup((caddr_t
)&np
->n_flag
);
1174 error
= vinvalbuf(vp
, flags
, 0, slptimeo
);
1176 np
->n_flag
&= ~(NLMODIFIED
| NFLUSHINPROG
);
1177 if (np
->n_flag
& NFLUSHWANT
) {
1178 np
->n_flag
&= ~NFLUSHWANT
;
1179 wakeup((caddr_t
)&np
->n_flag
);
1185 * Return true (non-zero) if the txthread and rxthread are operational
1186 * and we do not already have too many not-yet-started BIO's built up.
1189 nfs_asyncok(struct nfsmount
*nmp
)
1191 return (nmp
->nm_bioqlen
< nfs_maxasyncbio
&&
1192 nmp
->nm_bioqlen
< nmp
->nm_maxasync_scaled
/ NFS_ASYSCALE
&&
1193 nmp
->nm_rxstate
<= NFSSVC_PENDING
&&
1194 nmp
->nm_txstate
<= NFSSVC_PENDING
);
1198 * The read-ahead code calls this to queue a bio to the txthread.
1200 * We don't touch the bio otherwise... that is, we do not even
1201 * construct or send the initial rpc. The txthread will do it
1204 * NOTE! nm_bioqlen is not decremented until the request completes,
1205 * so it does not reflect the number of bio's on bioq.
1208 nfs_asyncio(struct vnode
*vp
, struct bio
*bio
)
1210 struct buf
*bp
= bio
->bio_buf
;
1211 struct nfsmount
*nmp
= VFSTONFS(vp
->v_mount
);
1213 KKASSERT(vp
->v_tag
== VT_NFS
);
1215 bio
->bio_driver_info
= vp
;
1217 TAILQ_INSERT_TAIL(&nmp
->nm_bioq
, bio
, bio_act
);
1218 atomic_add_int(&nmp
->nm_bioqlen
, 1);
1220 nfssvc_iod_writer_wakeup(nmp
);
1224 * nfs_doio() - Execute a BIO operation synchronously. The BIO will be
1225 * completed and its error returned. The caller is responsible
1226 * for brelse()ing it. ONLY USE FOR BIO_SYNC IOs! Otherwise
1227 * our error probe will be against an invalid pointer.
1229 * nfs_startio()- Execute a BIO operation asynchronously.
1231 * NOTE: nfs_asyncio() is used to initiate an asynchronous BIO operation,
1232 * which basically just queues it to the txthread. nfs_startio()
1233 * actually initiates the I/O AFTER it has gotten to the txthread.
1235 * NOTE: td might be NULL.
1238 nfs_startio(struct vnode
*vp
, struct bio
*bio
, struct thread
*td
)
1240 struct buf
*bp
= bio
->bio_buf
;
1242 struct nfsmount
*nmp
;
1244 KKASSERT(vp
->v_tag
== VT_NFS
);
1246 nmp
= VFSTONFS(vp
->v_mount
);
1249 * clear B_ERROR and B_INVAL state prior to initiating the I/O. We
1250 * do this here so we do not have to do it in all the code that
1253 bp
->b_flags
&= ~(B_ERROR
| B_INVAL
);
1255 KASSERT(bp
->b_cmd
!= BUF_CMD_DONE
,
1256 ("nfs_doio: bp %p already marked done!", bp
));
1258 if (bp
->b_cmd
== BUF_CMD_READ
) {
1259 switch (vp
->v_type
) {
1261 nfsstats
.read_bios
++;
1262 nfs_readrpc_bio(vp
, bio
);
1266 bio
->bio_offset
= 0;
1267 nfsstats
.readlink_bios
++;
1268 nfs_readlinkrpc_bio(vp
, bio
);
1270 nfs_doio(vp
, bio
, td
);
1275 * NOTE: If nfs_readdirplusrpc_bio() is requested but
1276 * not supported, it will chain to
1277 * nfs_readdirrpc_bio().
1280 nfsstats
.readdir_bios
++;
1281 uiop
->uio_offset
= bio
->bio_offset
;
1282 if (nmp
->nm_flag
& NFSMNT_RDIRPLUS
)
1283 nfs_readdirplusrpc_bio(vp
, bio
);
1285 nfs_readdirrpc_bio(vp
, bio
);
1287 nfs_doio(vp
, bio
, td
);
1291 kprintf("nfs_doio: type %x unexpected\n",vp
->v_type
);
1292 bp
->b_flags
|= B_ERROR
;
1293 bp
->b_error
= EINVAL
;
1299 * If we only need to commit, try to commit. If this fails
1300 * it will chain through to the write. Basically all the logic
1301 * in nfs_doio() is replicated.
1303 KKASSERT(bp
->b_cmd
== BUF_CMD_WRITE
);
1304 if (bp
->b_flags
& B_NEEDCOMMIT
)
1305 nfs_commitrpc_bio(vp
, bio
);
1307 nfs_writerpc_bio(vp
, bio
);
/*
 * nfs_doio - perform the I/O for a buffer through the uio-based RPCs.
 *
 * vp  - NFS vnode (asserted v_tag == VT_NFS).
 * bio - the request; the struct buf is bio->bio_buf.
 * td  - thread; on the regular-file read path its process is sent
 *       SIGKILL when the vnode has VTEXT set and np->n_mtime no longer
 *       matches the cached va_mtime.
 *
 * A single-iovec uio in UIO_SYSSPACE is pointed at the buffer.  B_ERROR
 * and B_INVAL are cleared first and the buffer must not already be
 * BUF_CMD_DONE.
 *
 * Read (BUF_CMD_READ): the iovec covers b_data for b_bcount bytes and
 * the RPC is chosen by vp->v_type: nfs_readrpc_uio(),
 * nfs_readlinkrpc_uio(), or nfs_readdirplusrpc_uio() /
 * nfs_readdirrpc_uio().  A short regular-file read is zero-filled.  If
 * readdirplus returns NFSERR_NOTSUPP the NFSMNT_RDIRPLUS mount flag is
 * cleared and plain readdir is used.  A directory read that returns no
 * data without error sets B_INVAL.  b_resid is taken from uio_resid.
 *
 * Write (asserted BUF_CMD_WRITE): with B_NEEDCOMMIT set a commit RPC is
 * tried over the dirty range; NFSERR_STALEWRITEVERF leads to
 * nfs_clearcommit().  For the write itself b_dirtyend is clipped to the
 * file size, the dirty range is sent with nfs_writerpc_uio(), and the
 * buffer flags are updated from the outcome (B_NEEDCOMMIT / B_CLUSTEROK
 * after an unstable write, B_EINTR, or B_ERROR with np->n_error and
 * NWRITEERR recorded).
 *
 * The visible return path yields b_error (or EIO when b_error is zero)
 * if B_ERROR is set; the other return statements are not visible here.
 *
 * NOTE(review): this listing has dropped lines (braces, case labels,
 * else/break/return, several declarations), so nesting is inferred.
 *
 * NOTE(review): on the short-read path nread and left are computed from
 * bp->b_resid, while the only visible store of the read residual into
 * bp->b_resid comes after the switch.  Unless a dropped line updates
 * b_resid first, the zero-fill works from a stale residual; uio_resid
 * looks like the intended source - confirm.
 */
1312 nfs_doio(struct vnode
*vp
, struct bio
*bio
, struct thread
*td
)
1314 struct buf
*bp
= bio
->bio_buf
;
1317 struct nfsmount
*nmp
;
1319 int iomode
, must_commit
;
1323 KKASSERT(vp
->v_tag
== VT_NFS
);
1325 nmp
= VFSTONFS(vp
->v_mount
);
1327 uiop
->uio_iov
= &io
;
1328 uiop
->uio_iovcnt
= 1;
1329 uiop
->uio_segflg
= UIO_SYSSPACE
;
1333 * clear B_ERROR and B_INVAL state prior to initiating the I/O. We
1334 * do this here so we do not have to do it in all the code that
1337 bp
->b_flags
&= ~(B_ERROR
| B_INVAL
);
1339 KASSERT(bp
->b_cmd
!= BUF_CMD_DONE
,
1340 ("nfs_doio: bp %p already marked done!", bp
));
1342 if (bp
->b_cmd
== BUF_CMD_READ
) {
1343 io
.iov_len
= uiop
->uio_resid
= (size_t)bp
->b_bcount
;
1344 io
.iov_base
= bp
->b_data
;
1345 uiop
->uio_rw
= UIO_READ
;
1347 switch (vp
->v_type
) {
1349 nfsstats
.read_bios
++;
1350 uiop
->uio_offset
= bio
->bio_offset
;
1351 error
= nfs_readrpc_uio(vp
, uiop
);
1353 if (uiop
->uio_resid
) {
1355 * If we had a short read with no error, we must have
1356 * hit a file hole. We should zero-fill the remainder.
1357 * This can also occur if the server hits the file EOF.
1359 * Holes used to be able to occur due to pending
1360 * writes, but that is not possible any longer.
1362 int nread
= bp
->b_bcount
- bp
->b_resid
;
1363 int left
= bp
->b_resid
;
1366 bzero((char *)bp
->b_data
+ nread
, left
);
1370 if (td
&& td
->td_proc
&& (vp
->v_flag
& VTEXT
) &&
1371 np
->n_mtime
!= np
->n_vattr
.va_mtime
.tv_sec
) {
1372 uprintf("Process killed due to text file modification\n");
1373 ksignal(td
->td_proc
, SIGKILL
);
1377 uiop
->uio_offset
= 0;
1378 nfsstats
.readlink_bios
++;
1379 error
= nfs_readlinkrpc_uio(vp
, uiop
);
1382 nfsstats
.readdir_bios
++;
1383 uiop
->uio_offset
= bio
->bio_offset
;
1384 if (nmp
->nm_flag
& NFSMNT_RDIRPLUS
) {
1385 error
= nfs_readdirplusrpc_uio(vp
, uiop
);
1386 if (error
== NFSERR_NOTSUPP
)
1387 nmp
->nm_flag
&= ~NFSMNT_RDIRPLUS
;
1389 if ((nmp
->nm_flag
& NFSMNT_RDIRPLUS
) == 0)
1390 error
= nfs_readdirrpc_uio(vp
, uiop
);
1392 * end-of-directory sets B_INVAL but does not generate an
1395 if (error
== 0 && uiop
->uio_resid
== bp
->b_bcount
)
1396 bp
->b_flags
|= B_INVAL
;
1399 kprintf("nfs_doio: type %x unexpected\n",vp
->v_type
);
1403 bp
->b_flags
|= B_ERROR
;
1404 bp
->b_error
= error
;
1406 bp
->b_resid
= uiop
->uio_resid
;
1409 * If we only need to commit, try to commit
1411 KKASSERT(bp
->b_cmd
== BUF_CMD_WRITE
);
1412 if (bp
->b_flags
& B_NEEDCOMMIT
) {
1416 off
= bio
->bio_offset
+ bp
->b_dirtyoff
;
1417 retv
= nfs_commitrpc_uio(vp
, off
,
1418 bp
->b_dirtyend
- bp
->b_dirtyoff
,
1421 bp
->b_dirtyoff
= bp
->b_dirtyend
= 0;
1422 bp
->b_flags
&= ~(B_NEEDCOMMIT
| B_CLUSTEROK
);
1427 if (retv
== NFSERR_STALEWRITEVERF
) {
1428 nfs_clearcommit(vp
->v_mount
);
1433 * Setup for actual write
1435 if (bio
->bio_offset
+ bp
->b_dirtyend
> np
->n_size
)
1436 bp
->b_dirtyend
= np
->n_size
- bio
->bio_offset
;
1438 if (bp
->b_dirtyend
> bp
->b_dirtyoff
) {
1439 io
.iov_len
= uiop
->uio_resid
= bp
->b_dirtyend
1441 uiop
->uio_offset
= bio
->bio_offset
+ bp
->b_dirtyoff
;
1442 io
.iov_base
= (char *)bp
->b_data
+ bp
->b_dirtyoff
;
1443 uiop
->uio_rw
= UIO_WRITE
;
1444 nfsstats
.write_bios
++;
1446 if ((bp
->b_flags
& (B_NEEDCOMMIT
| B_NOCACHE
| B_CLUSTER
)) == 0)
1447 iomode
= NFSV3WRITE_UNSTABLE
;
1449 iomode
= NFSV3WRITE_FILESYNC
;
1452 error
= nfs_writerpc_uio(vp
, uiop
, &iomode
, &must_commit
);
1455 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
1456 * to cluster the buffers needing commit. This will allow
1457 * the system to submit a single commit rpc for the whole
1458 * cluster. We can do this even if the buffer is not 100%
1459 * dirty (relative to the NFS blocksize), so we optimize the
1460 * append-to-file-case.
1462 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
1463 * cleared because write clustering only works for commit
1464 * rpc's, not for the data portion of the write).
1467 if (!error
&& iomode
== NFSV3WRITE_UNSTABLE
) {
1468 bp
->b_flags
|= B_NEEDCOMMIT
;
1469 if (bp
->b_dirtyoff
== 0
1470 && bp
->b_dirtyend
== bp
->b_bcount
)
1471 bp
->b_flags
|= B_CLUSTEROK
;
1473 bp
->b_flags
&= ~(B_NEEDCOMMIT
| B_CLUSTEROK
);
1477 * For an interrupted write, the buffer is still valid
1478 * and the write hasn't been pushed to the server yet,
1479 * so we can't set B_ERROR and report the interruption
1480 * by setting B_EINTR. For the async case, B_EINTR
1481 * is not relevant, so the rpc attempt is essentially
1482 * a noop. For the case of a V3 write rpc not being
1483 * committed to stable storage, the block is still
1484 * dirty and requires either a commit rpc or another
1485 * write rpc with iomode == NFSV3WRITE_FILESYNC before
1486 * the block is reused. This is indicated by setting
1487 * the B_DELWRI and B_NEEDCOMMIT flags.
1489 * If the buffer is marked B_PAGING, it does not reside on
1490 * the vp's paging queues so we cannot call bdirty(). The
1491 * bp in this case is not an NFS cache block so we should
1495 || (!error
&& (bp
->b_flags
& B_NEEDCOMMIT
))) {
1497 bp
->b_flags
&= ~(B_INVAL
|B_NOCACHE
);
1498 if ((bp
->b_flags
& B_PAGING
) == 0)
1501 bp
->b_flags
|= B_EINTR
;
1505 bp
->b_flags
|= B_ERROR
;
1506 bp
->b_error
= np
->n_error
= error
;
1507 np
->n_flag
|= NWRITEERR
;
1509 bp
->b_dirtyoff
= bp
->b_dirtyend
= 0;
1512 nfs_clearcommit(vp
->v_mount
);
1513 bp
->b_resid
= uiop
->uio_resid
;
1520 * I/O was run synchronously, biodone() it and calculate the
1524 KKASSERT(bp
->b_cmd
== BUF_CMD_DONE
);
1525 if (bp
->b_flags
& B_EINTR
)
1527 if (bp
->b_flags
& B_ERROR
)
1528 return (bp
->b_error
? bp
->b_error
: EIO
);
/*
 * nfs_meta_setsize - adjust buffers and the VM object for a new file size.
 *
 * vp    - NFS vnode whose size changes.
 * td    - thread, passed to nfs_getcacheblk().
 * nsize - new file size in bytes.
 *
 * tsize captures np->n_size on entry and biosize is the mount f_iosize.
 * When np->n_size is below tsize (the store of nsize into np->n_size is
 * presumably on a dropped line - confirm), vtruncbuf() is called and the
 * buffer straddling the new EOF is fetched with nfs_getcacheblk(); its
 * b_dirtyoff / b_dirtyend are clamped to b_bcount and B_RELBUF is set.
 * vnode_pager_setsize() is called with the new size, presumably on the
 * non-shrinking path.
 *
 * NOTE(review): bp is dereferenced on the line immediately following the
 * nfs_getcacheblk() call (listing lines 1565 and 1566 are consecutive),
 * so there is no NULL check.  If nfs_getcacheblk() can return NULL, for
 * example on an interrupted request, this is a null dereference -
 * confirm and guard.
 *
 * NOTE(review): bufsize is computed with a mask of (biosize - 1), which
 * is only correct when biosize is a power of two.  lbn is computed but
 * has no visible use in this listing.
 */
1533 * Used to aid in handling ftruncate() operations on the NFS client side.
1534 * Truncation creates a number of special problems for NFS. We have to
1535 * throw away VM pages and buffer cache buffers that are beyond EOF, and
1536 * we have to properly handle VM pages or (potentially dirty) buffers
1537 * that straddle the truncation point.
1541 nfs_meta_setsize(struct vnode
*vp
, struct thread
*td
, u_quad_t nsize
)
1543 struct nfsnode
*np
= VTONFS(vp
);
1544 u_quad_t tsize
= np
->n_size
;
1545 int biosize
= vp
->v_mount
->mnt_stat
.f_iosize
;
1550 if (np
->n_size
< tsize
) {
1557 * vtruncbuf() doesn't get the buffer overlapping the
1558 * truncation point. We may have a B_DELWRI and/or B_CACHE
1559 * buffer that now needs to be truncated.
1561 error
= vtruncbuf(vp
, nsize
, biosize
);
1562 lbn
= nsize
/ biosize
;
1563 bufsize
= nsize
& (biosize
- 1);
1564 loffset
= nsize
- bufsize
;
1565 bp
= nfs_getcacheblk(vp
, loffset
, bufsize
, td
);
1566 if (bp
->b_dirtyoff
> bp
->b_bcount
)
1567 bp
->b_dirtyoff
= bp
->b_bcount
;
1568 if (bp
->b_dirtyend
> bp
->b_bcount
)
1569 bp
->b_dirtyend
= bp
->b_bcount
;
1570 bp
->b_flags
|= B_RELBUF
; /* don't leave garbage around */
1573 vnode_pager_setsize(vp
, nsize
);
/*
 * nfsiodone_sync - completion callback used for synchronous NFS I/O.
 *
 * bio - the completed request; its buffer (bio->bio_buf) is handed to
 *       bpdone() with a second argument of 0.
 *
 * NOTE(review): listing line 1585 is missing, so there may be one more
 * statement ahead of the bpdone() call - confirm against the real file.
 */
1579 * Synchronous completion for nfs_doio. Call bpdone() with elseit=FALSE.
1580 * Caller is responsible for brelse()'ing the bp.
1583 nfsiodone_sync(struct bio
*bio
)
1586 bpdone(bio
->bio_buf
, 0);
/*
 * nfs_readrpc_bio - build and queue an asynchronous NFS READ for a bio.
 *
 * vp  - NFS vnode to read from.
 * bio - the request; the struct buf is bio->bio_buf.
 *
 * An nfsm_info is allocated from M_NFSREQ with M_WAITOK and info->v3 is
 * set from NFS_ISV3(vp).  The transfer size is b_bcount and is asserted
 * not to exceed the mount nm_rsize; the range is also checked against
 * nm_maxfilesize (the body of that check is not visible here).  The
 * request carries the file handle followed by three 32-bit words: a
 * 64-bit offset plus length in one variant, and 32-bit offset, length
 * and a third word (not visible) in the other - presumably selected by
 * info->v3.  nfs_readrpc_bio_done() is installed as info->done before
 * the request is handed to nfsm_request_bio().
 *
 * On the visible failure path info is freed and the buffer is marked
 * B_ERROR with b_error = error.
 *
 * NOTE(review): len is encoded into the request but its assignment is
 * not visible in this listing (dropped line) - confirm it is set from
 * tsiz before use.
 */
1590 * nfs read rpc - BIO version
1593 nfs_readrpc_bio(struct vnode
*vp
, struct bio
*bio
)
1595 struct buf
*bp
= bio
->bio_buf
;
1597 struct nfsmount
*nmp
;
1598 int error
= 0, len
, tsiz
;
1599 struct nfsm_info
*info
;
1601 info
= kmalloc(sizeof(*info
), M_NFSREQ
, M_WAITOK
);
1603 info
->v3
= NFS_ISV3(vp
);
1605 nmp
= VFSTONFS(vp
->v_mount
);
1606 tsiz
= bp
->b_bcount
;
1607 KKASSERT(tsiz
<= nmp
->nm_rsize
);
1608 if (bio
->bio_offset
+ tsiz
> nmp
->nm_maxfilesize
) {
1612 nfsstats
.rpccnt
[NFSPROC_READ
]++;
1614 nfsm_reqhead(info
, vp
, NFSPROC_READ
,
1615 NFSX_FH(info
->v3
) + NFSX_UNSIGNED
* 3);
1616 ERROROUT(nfsm_fhtom(info
, vp
));
1617 tl
= nfsm_build(info
, NFSX_UNSIGNED
* 3);
1619 txdr_hyper(bio
->bio_offset
, tl
);
1620 *(tl
+ 2) = txdr_unsigned(len
);
1622 *tl
++ = txdr_unsigned(bio
->bio_offset
);
1623 *tl
++ = txdr_unsigned(len
);
1627 info
->done
= nfs_readrpc_bio_done
;
1628 nfsm_request_bio(info
, vp
, NFSPROC_READ
, NULL
,
1629 nfs_vpcred(vp
, ND_READ
));
1632 kfree(info
, M_NFSREQ
);
1633 bp
->b_error
= error
;
1634 bp
->b_flags
|= B_ERROR
;
/*
 * nfs_readrpc_bio_done - completion handler for nfs_readrpc_bio().
 *
 * info - request state; asserted to be in NFSM_STATE_DONE.  The bio is
 *        info->bio and the buffer is bio->bio_buf.
 *
 * Attributes are loaded from the reply with nfsm_postop_attr() (using
 * NFS_LATTR_NOSHRINK, followed by a two-word dissect whose second word
 * is the eof flag) or with nfsm_loadattr() - presumably the v3 and v2
 * arms respectively.  The returned data length, bounded by nm_rsize, is
 * obtained with nfsm_strsiz() and the data is copied into the bio with
 * nfsm_mtobio(); the reply mbuf chain is then freed.
 *
 * If fewer than b_bcount bytes came back the tail of b_data is zeroed,
 * and b_resid is set to b_bcount - retlen.  The bodies of the eof /
 * short-read tests that follow are not visible in this listing.
 *
 * info is freed with kfree(); on the visible error path the buffer is
 * marked B_ERROR with b_error = error.
 *
 * NOTE(review): len is compared against retlen but has no visible
 * declaration or assignment here (dropped lines) - confirm its source.
 */
1639 nfs_readrpc_bio_done(nfsm_info_t info
)
1641 struct nfsmount
*nmp
= VFSTONFS(info
->vp
->v_mount
);
1642 struct bio
*bio
= info
->bio
;
1643 struct buf
*bp
= bio
->bio_buf
;
1650 KKASSERT(info
->state
== NFSM_STATE_DONE
);
1653 ERROROUT(nfsm_postop_attr(info
, info
->vp
, &attrflag
,
1654 NFS_LATTR_NOSHRINK
));
1655 NULLOUT(tl
= nfsm_dissect(info
, 2 * NFSX_UNSIGNED
));
1656 eof
= fxdr_unsigned(int, *(tl
+ 1));
1658 ERROROUT(nfsm_loadattr(info
, info
->vp
, NULL
));
1661 NEGATIVEOUT(retlen
= nfsm_strsiz(info
, nmp
->nm_rsize
));
1662 ERROROUT(nfsm_mtobio(info
, bio
, retlen
));
1663 m_freem(info
->mrep
);
1667 * No error occured, fill the hole if any
1669 if (retlen
< bp
->b_bcount
) {
1670 bzero(bp
->b_data
+ retlen
, bp
->b_bcount
- retlen
);
1672 bp
->b_resid
= bp
->b_bcount
- retlen
;
1677 if (eof
|| retlen
== 0) {
1680 } else if (retlen
< len
) {
1685 kfree(info
, M_NFSREQ
);
1687 bp
->b_error
= error
;
1688 bp
->b_flags
|= B_ERROR
;
/*
 * nfs_writerpc_bio - build and queue an asynchronous NFS WRITE for a bio.
 *
 * vp  - NFS vnode to write to.
 * bio - the request; the struct buf is bio->bio_buf.
 *
 * b_dirtyend is first clipped so the write does not extend past
 * np->n_size.  If the dirty range is empty the bio is only cleaned up
 * (that body is not visible here).  Otherwise len and offset describe
 * the dirty range; a range ending beyond nm_maxfilesize marks the buffer
 * B_ERROR with EFBIG.
 *
 * An nfsm_info is allocated from M_NFSREQ, info->v3 is set from
 * NFS_ISV3(vp) and must_commit starts at 0.  iomode is
 * NFSV3WRITE_UNSTABLE when none of B_NEEDCOMMIT, B_NOCACHE or B_CLUSTER
 * is set, otherwise NFSV3WRITE_FILESYNC is assigned.  len is asserted
 * not to exceed nm_wsize.  The request carries the file handle, then
 * either five words (64-bit offset, len, iomode, len) or four words
 * (32-bit offset twice, len twice) - presumably the v3 and v2 layouts -
 * followed by the data copied from the buffer by nfsm_biotom() starting
 * at b_dirtyoff.  nfs_writerpc_bio_done() is installed as info->done.
 *
 * On the visible failure path info is freed and the buffer is marked
 * B_ERROR with b_error = error.
 *
 * NOTE(review): lines are missing from this listing (declarations, the
 * tl advance after txdr_hyper(), returns/biodone calls), so the early
 * exit paths cannot be checked from here.
 */
1694 * nfs write call - BIO version
1697 nfs_writerpc_bio(struct vnode
*vp
, struct bio
*bio
)
1699 struct nfsmount
*nmp
= VFSTONFS(vp
->v_mount
);
1700 struct nfsnode
*np
= VTONFS(vp
);
1701 struct buf
*bp
= bio
->bio_buf
;
1706 struct nfsm_info
*info
;
1710 * Setup for actual write. Just clean up the bio if there
1713 if (bio
->bio_offset
+ bp
->b_dirtyend
> np
->n_size
)
1714 bp
->b_dirtyend
= np
->n_size
- bio
->bio_offset
;
1716 if (bp
->b_dirtyend
<= bp
->b_dirtyoff
) {
1721 len
= bp
->b_dirtyend
- bp
->b_dirtyoff
;
1722 offset
= bio
->bio_offset
+ bp
->b_dirtyoff
;
1723 if (offset
+ len
> nmp
->nm_maxfilesize
) {
1724 bp
->b_flags
|= B_ERROR
;
1725 bp
->b_error
= EFBIG
;
1730 nfsstats
.write_bios
++;
1732 info
= kmalloc(sizeof(*info
), M_NFSREQ
, M_WAITOK
);
1734 info
->v3
= NFS_ISV3(vp
);
1735 info
->info_writerpc
.must_commit
= 0;
1736 if ((bp
->b_flags
& (B_NEEDCOMMIT
| B_NOCACHE
| B_CLUSTER
)) == 0)
1737 iomode
= NFSV3WRITE_UNSTABLE
;
1739 iomode
= NFSV3WRITE_FILESYNC
;
1741 KKASSERT(len
<= nmp
->nm_wsize
);
1743 nfsstats
.rpccnt
[NFSPROC_WRITE
]++;
1744 nfsm_reqhead(info
, vp
, NFSPROC_WRITE
,
1745 NFSX_FH(info
->v3
) + 5 * NFSX_UNSIGNED
+ nfsm_rndup(len
));
1746 ERROROUT(nfsm_fhtom(info
, vp
));
1748 tl
= nfsm_build(info
, 5 * NFSX_UNSIGNED
);
1749 txdr_hyper(offset
, tl
);
1751 *tl
++ = txdr_unsigned(len
);
1752 *tl
++ = txdr_unsigned(iomode
);
1753 *tl
= txdr_unsigned(len
);
1757 tl
= nfsm_build(info
, 4 * NFSX_UNSIGNED
);
1758 /* Set both "begin" and "current" to non-garbage. */
1759 x
= txdr_unsigned((u_int32_t
)offset
);
1760 *tl
++ = x
; /* "begin offset" */
1761 *tl
++ = x
; /* "current offset" */
1762 x
= txdr_unsigned(len
);
1763 *tl
++ = x
; /* total to this offset */
1764 *tl
= x
; /* size of this write */
1766 ERROROUT(nfsm_biotom(info
, bio
, bp
->b_dirtyoff
, len
));
1768 info
->done
= nfs_writerpc_bio_done
;
1769 nfsm_request_bio(info
, vp
, NFSPROC_WRITE
, NULL
,
1770 nfs_vpcred(vp
, ND_WRITE
));
1773 kfree(info
, M_NFSREQ
);
1774 bp
->b_error
= error
;
1775 bp
->b_flags
|= B_ERROR
;
/*
 * nfs_writerpc_bio_done - completion handler for nfs_writerpc_bio().
 *
 * info - request state; the bio is info->bio, the buffer is
 *        bio->bio_buf, and info->info_writerpc.must_commit records
 *        whether the server write verifier changed.
 *
 * Reply parsing: one arm checks the weak cache consistency data with
 * nfsm_wcc_data() (wccflag = NFSV3_WCCCHK), dissects the written length
 * rlen, the commit level and the write verifier; the other arm only
 * loads attributes with nfsm_loadattr() - presumably v3 and v2.  The
 * first verifier seen is stored in nmp->nm_verf and NFSSTA_HASWRITEVERF
 * is set; a later mismatch sets must_commit and replaces the stored
 * verifier.  On a mount with MNT_ASYNC, iomode is forced to
 * NFSV3WRITE_FILESYNC.
 *
 * Buffer cleanup: an error-free unstable write sets B_NEEDCOMMIT (plus
 * B_CLUSTEROK when the whole buffer was dirty); B_NEEDCOMMIT and
 * B_CLUSTEROK are also cleared (presumably on the other arm).  EINTR,
 * or success with B_NEEDCOMMIT, keeps the buffer by clearing B_INVAL
 * and B_NOCACHE, and B_EINTR is set on a path whose condition is not
 * visible.  Other errors set B_ERROR, record the error in b_error and
 * np->n_error and set NWRITEERR.  b_dirtyoff and b_dirtyend are reset to
 * 0.  If must_commit is set nfs_clearcommit() is called, then info is
 * freed.
 *
 * NOTE(review): len is initialised from bp->b_resid on the assumption
 * that the submit side stored the shortened length there, but no store
 * to b_resid is visible in nfs_writerpc_bio() in this listing - confirm.
 *
 * NOTE(review): the short-write arm adjusts uiop, which has no visible
 * declaration in this function; presumably that code is inside a
 * disabled preprocessor region (dropped lines) - confirm.
 */
1780 nfs_writerpc_bio_done(nfsm_info_t info
)
1782 struct nfsmount
*nmp
= VFSTONFS(info
->vp
->v_mount
);
1783 struct nfsnode
*np
= VTONFS(info
->vp
);
1784 struct bio
*bio
= info
->bio
;
1785 struct buf
*bp
= bio
->bio_buf
;
1786 int wccflag
= NFSV3_WCCRATTR
;
1787 int iomode
= NFSV3WRITE_FILESYNC
;
1791 int len
= bp
->b_resid
; /* b_resid was set to shortened length */
1796 * The write RPC returns a before and after mtime. The
1797 * nfsm_wcc_data() macro checks the before n_mtime
1798 * against the before time and stores the after time
1799 * in the nfsnode's cached vattr and n_mtime field.
1800 * The NRMODIFIED bit will be set if the before
1801 * time did not match the original mtime.
1803 wccflag
= NFSV3_WCCCHK
;
1804 ERROROUT(nfsm_wcc_data(info
, info
->vp
, &wccflag
));
1806 NULLOUT(tl
= nfsm_dissect(info
, 2 * NFSX_UNSIGNED
+ NFSX_V3WRITEVERF
));
1807 rlen
= fxdr_unsigned(int, *tl
++);
1810 m_freem(info
->mrep
);
1813 } else if (rlen
< len
) {
1816 * XXX what do we do here?
1818 backup
= len
- rlen
;
1819 uiop
->uio_iov
->iov_base
= (char *)uiop
->uio_iov
->iov_base
- backup
;
1820 uiop
->uio_iov
->iov_len
+= backup
;
1821 uiop
->uio_offset
-= backup
;
1822 uiop
->uio_resid
+= backup
;
1826 commit
= fxdr_unsigned(int, *tl
++);
1829 * Return the lowest committment level
1830 * obtained by any of the RPCs.
1832 if (iomode
== NFSV3WRITE_FILESYNC
)
1834 else if (iomode
== NFSV3WRITE_DATASYNC
&&
1835 commit
== NFSV3WRITE_UNSTABLE
)
1837 if ((nmp
->nm_state
& NFSSTA_HASWRITEVERF
) == 0){
1838 bcopy(tl
, (caddr_t
)nmp
->nm_verf
, NFSX_V3WRITEVERF
);
1839 nmp
->nm_state
|= NFSSTA_HASWRITEVERF
;
1840 } else if (bcmp(tl
, nmp
->nm_verf
, NFSX_V3WRITEVERF
)) {
1841 info
->info_writerpc
.must_commit
= 1;
1842 bcopy(tl
, (caddr_t
)nmp
->nm_verf
, NFSX_V3WRITEVERF
);
1846 ERROROUT(nfsm_loadattr(info
, info
->vp
, NULL
));
1848 m_freem(info
->mrep
);
1852 if (info
->vp
->v_mount
->mnt_flag
& MNT_ASYNC
)
1853 iomode
= NFSV3WRITE_FILESYNC
;
1857 * End of RPC. Now clean up the bp.
1859 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
1860 * to cluster the buffers needing commit. This will allow
1861 * the system to submit a single commit rpc for the whole
1862 * cluster. We can do this even if the buffer is not 100%
1863 * dirty (relative to the NFS blocksize), so we optimize the
1864 * append-to-file-case.
1866 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
1867 * cleared because write clustering only works for commit
1868 * rpc's, not for the data portion of the write).
1870 if (!error
&& iomode
== NFSV3WRITE_UNSTABLE
) {
1871 bp
->b_flags
|= B_NEEDCOMMIT
;
1872 if (bp
->b_dirtyoff
== 0 && bp
->b_dirtyend
== bp
->b_bcount
)
1873 bp
->b_flags
|= B_CLUSTEROK
;
1875 bp
->b_flags
&= ~(B_NEEDCOMMIT
| B_CLUSTEROK
);
1879 * For an interrupted write, the buffer is still valid
1880 * and the write hasn't been pushed to the server yet,
1881 * so we can't set B_ERROR and report the interruption
1882 * by setting B_EINTR. For the async case, B_EINTR
1883 * is not relevant, so the rpc attempt is essentially
1884 * a noop. For the case of a V3 write rpc not being
1885 * committed to stable storage, the block is still
1886 * dirty and requires either a commit rpc or another
1887 * write rpc with iomode == NFSV3WRITE_FILESYNC before
1888 * the block is reused. This is indicated by setting
1889 * the B_DELWRI and B_NEEDCOMMIT flags.
1891 * If the buffer is marked B_PAGING, it does not reside on
1892 * the vp's paging queues so we cannot call bdirty(). The
1893 * bp in this case is not an NFS cache block so we should
1896 if (error
== EINTR
|| (!error
&& (bp
->b_flags
& B_NEEDCOMMIT
))) {
1898 bp
->b_flags
&= ~(B_INVAL
|B_NOCACHE
);
1899 if ((bp
->b_flags
& B_PAGING
) == 0)
1902 bp
->b_flags
|= B_EINTR
;
1906 bp
->b_flags
|= B_ERROR
;
1907 bp
->b_error
= np
->n_error
= error
;
1908 np
->n_flag
|= NWRITEERR
;
1910 bp
->b_dirtyoff
= bp
->b_dirtyend
= 0;
1912 if (info
->info_writerpc
.must_commit
)
1913 nfs_clearcommit(info
->vp
->v_mount
);
1914 kfree(info
, M_NFSREQ
);
1916 bp
->b_flags
|= B_ERROR
;
1917 bp
->b_error
= error
;
/*
 * nfs_commitrpc_bio - build and queue an asynchronous COMMIT for a bio.
 *
 * vp  - NFS vnode whose data is to be committed.
 * bio - the request; the struct buf is bio->bio_buf.
 *
 * If the mount has no write verifier yet (NFSSTA_HASWRITEVERF clear in
 * nm_state) no RPC is built: the dirty range is reset to 0 and
 * B_NEEDCOMMIT and B_CLUSTEROK are cleared.  Otherwise an nfsm_info is
 * allocated from M_NFSREQ and the request is built with the file handle
 * (sized by NFSX_FH(1)) followed by three words: the 64-bit offset
 * bio_offset + b_dirtyoff and the byte count b_dirtyend - b_dirtyoff.
 * nfs_commitrpc_bio_done() is installed as info->done before
 * nfsm_request_bio() is called.
 *
 * If building the request fails early, info is freed and the bio is
 * handed to nfs_writerpc_bio() so the data is written again instead.
 *
 * NOTE(review): lines are missing from this listing (declarations, the
 * tl advance after txdr_hyper(), return/biodone on the no-verifier
 * path), so completion of the early-out path cannot be checked here.
 */
1923 * Nfs Version 3 commit rpc - BIO version
1925 * This function issues the commit rpc and will chain to a write
1929 nfs_commitrpc_bio(struct vnode
*vp
, struct bio
*bio
)
1931 struct nfsmount
*nmp
= VFSTONFS(vp
->v_mount
);
1932 struct buf
*bp
= bio
->bio_buf
;
1933 struct nfsm_info
*info
;
1937 if ((nmp
->nm_state
& NFSSTA_HASWRITEVERF
) == 0) {
1938 bp
->b_dirtyoff
= bp
->b_dirtyend
= 0;
1939 bp
->b_flags
&= ~(B_NEEDCOMMIT
| B_CLUSTEROK
);
1945 info
= kmalloc(sizeof(*info
), M_NFSREQ
, M_WAITOK
);
1949 nfsstats
.rpccnt
[NFSPROC_COMMIT
]++;
1950 nfsm_reqhead(info
, vp
, NFSPROC_COMMIT
, NFSX_FH(1));
1951 ERROROUT(nfsm_fhtom(info
, vp
));
1952 tl
= nfsm_build(info
, 3 * NFSX_UNSIGNED
);
1953 txdr_hyper(bio
->bio_offset
+ bp
->b_dirtyoff
, tl
);
1955 *tl
= txdr_unsigned(bp
->b_dirtyend
- bp
->b_dirtyoff
);
1957 info
->done
= nfs_commitrpc_bio_done
;
1958 nfsm_request_bio(info
, vp
, NFSPROC_COMMIT
, NULL
,
1959 nfs_vpcred(vp
, ND_WRITE
));
1963 * Chain to write RPC on (early) error
1965 kfree(info
, M_NFSREQ
);
1966 nfs_writerpc_bio(vp
, bio
);
/*
 * nfs_commitrpc_bio_done - completion handler for nfs_commitrpc_bio().
 *
 * info - request state; the bio is info->bio and the buffer is
 *        bio->bio_buf.
 *
 * The weak cache consistency data is processed with nfsm_wcc_data() and
 * the write verifier is dissected from the reply.  If it differs from
 * nmp->nm_verf the stored verifier is replaced and error is set to
 * NFSERR_STALEWRITEVERF.  The reply mbuf chain is freed, then info is
 * freed.  One outcome resets the dirty range and clears B_NEEDCOMMIT and
 * B_CLUSTEROK (presumably the error == 0 case); the other logs a
 * message and chains to nfs_writerpc_bio() so the data is rewritten.
 *
 * NOTE(review): possible use-after-free.  kfree(info, M_NFSREQ) at
 * listing line 1995 comes before nfs_writerpc_bio(info->vp, bio) at
 * listing line 2003, and no reassignment of info is visible in between.
 * If both are on the same path, info->vp is read from freed memory.
 * Suggested fix: save info->vp in a local before the kfree(), or move
 * the kfree() after the last use of info.  Confirm against the real
 * file, since lines are missing from this listing.
 *
 * NOTE(review): the kprintf below passes bio->bio_offset to a %lld
 * conversion without a cast; if the offset type is not long long on
 * every supported platform the argument should be cast.  The message
 * also looks like leftover debugging output.
 */
1970 nfs_commitrpc_bio_done(nfsm_info_t info
)
1972 struct nfsmount
*nmp
= VFSTONFS(info
->vp
->v_mount
);
1973 struct bio
*bio
= info
->bio
;
1974 struct buf
*bp
= bio
->bio_buf
;
1976 int wccflag
= NFSV3_WCCRATTR
;
1979 ERROROUT(nfsm_wcc_data(info
, info
->vp
, &wccflag
));
1981 NULLOUT(tl
= nfsm_dissect(info
, NFSX_V3WRITEVERF
));
1982 if (bcmp(nmp
->nm_verf
, tl
, NFSX_V3WRITEVERF
)) {
1983 bcopy(tl
, nmp
->nm_verf
, NFSX_V3WRITEVERF
);
1984 error
= NFSERR_STALEWRITEVERF
;
1987 m_freem(info
->mrep
);
1991 * On completion we must chain to a write bio if an
1995 kfree(info
, M_NFSREQ
);
1997 bp
->b_dirtyoff
= bp
->b_dirtyend
= 0;
1998 bp
->b_flags
&= ~(B_NEEDCOMMIT
| B_CLUSTEROK
);
2002 kprintf("commitrpc_bioC %lld -> CHAIN WRITE\n", bio
->bio_offset
);
2003 nfs_writerpc_bio(info
->vp
, bio
);