sys/vfs/nfs/nfs_bio.c

   1 /*
   2  * Copyright (c) 1989, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  *
   5  * This code is derived from software contributed to Berkeley by
   6  * Rick Macklem at The University of Guelph.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  * 3. All advertising materials mentioning features or use of this software
  17  *    must display the following acknowledgement:
  18  *      This product includes software developed by the University of
  19  *      California, Berkeley and its contributors.
  20  * 4. Neither the name of the University nor the names of its contributors
  21  *    may be used to endorse or promote products derived from this software
  22  *    without specific prior written permission.
  23  *
  24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  34  * SUCH DAMAGE.
  35  *
  36  *      @(#)nfs_bio.c   8.9 (Berkeley) 3/30/95
  37  * $FreeBSD: /repoman/r/ncvs/src/sys/nfsclient/nfs_bio.c,v 1.130 2004/04/14 23:23:55 peadar Exp $
  38  * $DragonFly: src/sys/vfs/nfs/nfs_bio.c,v 1.45 2008/07/18 00:09:39 dillon Exp $
  39  */
  40
  41
  42 #include <sys/param.h>
  43 #include <sys/systm.h>
  44 #include <sys/resourcevar.h>
  45 #include <sys/signalvar.h>
  46 #include <sys/proc.h>
  47 #include <sys/buf.h>
  48 #include <sys/vnode.h>
  49 #include <sys/mount.h>
  50 #include <sys/kernel.h>
  51 #include <sys/mbuf.h>
  52 #include <sys/msfbuf.h>
  53
  54 #include <vm/vm.h>
  55 #include <vm/vm_extern.h>
  56 #include <vm/vm_page.h>
  57 #include <vm/vm_object.h>
  58 #include <vm/vm_pager.h>
  59 #include <vm/vnode_pager.h>
  60
  61 #include <sys/buf2.h>
  62 #include <sys/thread2.h>
  63 #include <vm/vm_page2.h>
  64
  65 #include "rpcv2.h"
  66 #include "nfsproto.h"
  67 #include "nfs.h"
  68 #include "nfsmount.h"
  69 #include "nfsnode.h"
  70 #include "xdr_subs.h"
  71 #include "nfsm_subs.h"
  72
  73
  74 static struct buf *nfs_getcacheblk(struct vnode *vp, off_t loffset,
  75                                    int size, struct thread *td);
  76 static int nfs_check_dirent(struct nfs_dirent *dp, int maxlen);
  77 static void nfsiodone_sync(struct bio *bio);
  78 static void nfs_readrpc_bio_done(nfsm_info_t info);
  79 static void nfs_writerpc_bio_done(nfsm_info_t info);
  80 static void nfs_commitrpc_bio_done(nfsm_info_t info);
  81
  82 /*
  83  * Vnode op for read using bio
  84  */
  85 int
  86 nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag)
  87 {
  88         struct nfsnode *np = VTONFS(vp);
  89         int biosize, i;
  90         struct buf *bp, *rabp;
  91         struct vattr vattr;
  92         struct thread *td;
  93         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
  94         off_t lbn, rabn;
  95         off_t raoffset;
  96         off_t loffset;
  97         int seqcount;
  98         int nra, error = 0;
  99         int boff = 0;
 100         size_t n;
 101
 102 #ifdef DIAGNOSTIC
 103         if (uio->uio_rw != UIO_READ)
 104                 panic("nfs_read mode");
 105 #endif
 106         if (uio->uio_resid == 0)
 107                 return (0);
 108         if (uio->uio_offset < 0)        /* XXX VDIR cookies can be negative */
 109                 return (EINVAL);
 110         td = uio->uio_td;
 111
 112         if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
 113             (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
 114                 (void)nfs_fsinfo(nmp, vp, td);
 115         if (vp->v_type != VDIR &&
 116             (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
 117                 return (EFBIG);
 118         biosize = vp->v_mount->mnt_stat.f_iosize;
 119         seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE);
 120
 121         /*
 122          * For nfs, cache consistency can only be maintained approximately.
 123          * Although RFC1094 does not specify the criteria, the following is
 124          * believed to be compatible with the reference port.
 125          *
 126          * NFS:         If local changes have been made and this is a
 127          *              directory, the directory must be invalidated and
 128          *              the attribute cache must be cleared.
 129          *
 130          *              GETATTR is called to synchronize the file size.
 131          *
 132          *              If remote changes are detected local data is flushed
 133          *              and the cache is invalidated.
 134          *
 135          *              NOTE: In the normal case the attribute cache is not
 136          *              cleared which means GETATTR may use cached data and
 137          *              not immediately detect changes made on the server.
 138          */
 139         if ((np->n_flag & NLMODIFIED) && vp->v_type == VDIR) {
 140                 nfs_invaldir(vp);
 141                 error = nfs_vinvalbuf(vp, V_SAVE, 1);
 142                 if (error)
 143                         return (error);
 144                 np->n_attrstamp = 0;
 145         }
 146         error = VOP_GETATTR(vp, &vattr);
 147         if (error)
 148                 return (error);
 149
 150         /*
 151          * This can deadlock getpages/putpages for regular
 152          * files.  Only do it for directories.
 153          */
 154         if (np->n_flag & NRMODIFIED) {
 155                 if (vp->v_type == VDIR) {
 156                         nfs_invaldir(vp);
 157                         error = nfs_vinvalbuf(vp, V_SAVE, 1);
 158                         if (error)
 159                                 return (error);
 160                         np->n_flag &= ~NRMODIFIED;
 161                 }
 162         }
 163
 164         /*
 165          * Loop until uio exhausted or we hit EOF
 166          */
 167         do {
 168             bp = NULL;
 169
 170             switch (vp->v_type) {
 171             case VREG:
 172                 nfsstats.biocache_reads++;
 173                 lbn = uio->uio_offset / biosize;
 174                 boff = uio->uio_offset & (biosize - 1);
 175                 loffset = (off_t)lbn * biosize;
 176
 177                 /*
 178                  * Start the read ahead(s), as required.
 179                  */
 180                 if (nmp->nm_readahead > 0 && nfs_asyncok(nmp)) {
 181                     for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
 182                         (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
 183                         rabn = lbn + 1 + nra;
 184                         raoffset = (off_t)rabn * biosize;
 185                         if (findblk(vp, raoffset, FINDBLK_TEST) == NULL) {
 186                             rabp = nfs_getcacheblk(vp, raoffset, biosize, td);
 187                             if (!rabp)
 188                                 return (EINTR);
 189                             if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
 190                                 rabp->b_cmd = BUF_CMD_READ;
 191                                 vfs_busy_pages(vp, rabp);
 192                                 nfs_asyncio(vp, &rabp->b_bio2);
 193                             } else {
 194                                 brelse(rabp);
 195                             }
 196                         }
 197                     }
 198                 }
 199
 200                 /*
 201                  * Obtain the buffer cache block.  Figure out the buffer size
 202                  * when we are at EOF.  If we are modifying the size of the
 203                  * buffer based on an EOF condition we need to hold
 204                  * nfs_rslock() through obtaining the buffer to prevent
 205                  * a potential writer-appender from messing with n_size.
 206                  * Otherwise we may accidently truncate the buffer and
 207                  * lose dirty data.
 208                  *
 209                  * Note that bcount is *not* DEV_BSIZE aligned.
 210                  */
 211                 if (loffset + boff >= np->n_size) {
 212                         n = 0;
 213                         break;
 214                 }
 215                 bp = nfs_getcacheblk(vp, loffset, biosize, td);
 216
 217                 if (bp == NULL)
 218                         return (EINTR);
 219
 220                 /*
 221                  * If B_CACHE is not set, we must issue the read.  If this
 222                  * fails, we return an error.
 223                  */
 224                 if ((bp->b_flags & B_CACHE) == 0) {
 225                         bp->b_cmd = BUF_CMD_READ;
 226                         bp->b_bio2.bio_done = nfsiodone_sync;
 227                         bp->b_bio2.bio_flags |= BIO_SYNC;
 228                         vfs_busy_pages(vp, bp);
 229                         error = nfs_doio(vp, &bp->b_bio2, td);
 230                         if (error) {
 231                                 brelse(bp);
 232                                 return (error);
 233                         }
 234                 }
 235
 236                 /*
 237                  * on is the offset into the current bp.  Figure out how many
 238                  * bytes we can copy out of the bp.  Note that bcount is
 239                  * NOT DEV_BSIZE aligned.
 240                  *
 241                  * Then figure out how many bytes we can copy into the uio.
 242                  */
 243                 n = biosize - boff;
 244                 if (n > uio->uio_resid)
 245                         n = uio->uio_resid;
 246                 if (loffset + boff + n > np->n_size)
 247                         n = np->n_size - loffset - boff;
 248                 break;
 249             case VLNK:
 250                 biosize = min(NFS_MAXPATHLEN, np->n_size);
 251                 nfsstats.biocache_readlinks++;
 252                 bp = nfs_getcacheblk(vp, (off_t)0, biosize, td);
 253                 if (bp == NULL)
 254                         return (EINTR);
 255                 if ((bp->b_flags & B_CACHE) == 0) {
 256                         bp->b_cmd = BUF_CMD_READ;
 257                         bp->b_bio2.bio_done = nfsiodone_sync;
 258                         bp->b_bio2.bio_flags |= BIO_SYNC;
 259                         vfs_busy_pages(vp, bp);
 260                         error = nfs_doio(vp, &bp->b_bio2, td);
 261                         if (error) {
 262                                 bp->b_flags |= B_ERROR | B_INVAL;
 263                                 brelse(bp);
 264                                 return (error);
 265                         }
 266                 }
 267                 n = szmin(uio->uio_resid, (size_t)bp->b_bcount - bp->b_resid);
 268                 boff = 0;
 269                 break;
 270             case VDIR:
 271                 nfsstats.biocache_readdirs++;
 272                 if (np->n_direofoffset &&
 273                     uio->uio_offset >= np->n_direofoffset
 274                 ) {
 275                         return (0);
 276                 }
 277                 lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
 278                 boff = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
 279                 loffset = uio->uio_offset - boff;
 280                 bp = nfs_getcacheblk(vp, loffset, NFS_DIRBLKSIZ, td);
 281                 if (bp == NULL)
 282                         return (EINTR);
 283
 284                 if ((bp->b_flags & B_CACHE) == 0) {
 285                     bp->b_cmd = BUF_CMD_READ;
 286                     bp->b_bio2.bio_done = nfsiodone_sync;
 287                     bp->b_bio2.bio_flags |= BIO_SYNC;
 288                     vfs_busy_pages(vp, bp);
 289                     error = nfs_doio(vp, &bp->b_bio2, td);
 290                     if (error)
 291                             brelse(bp);
 292                     while (error == NFSERR_BAD_COOKIE) {
 293                         kprintf("got bad cookie vp %p bp %p\n", vp, bp);
 294                         nfs_invaldir(vp);
 295                         error = nfs_vinvalbuf(vp, 0, 1);
 296                         /*
 297                          * Yuck! The directory has been modified on the
 298                          * server. The only way to get the block is by
 299                          * reading from the beginning to get all the
 300                          * offset cookies.
 301                          *
 302                          * Leave the last bp intact unless there is an error.
 303                          * Loop back up to the while if the error is another
 304                          * NFSERR_BAD_COOKIE (double yuch!).
 305                          */
 306                         for (i = 0; i <= lbn && !error; i++) {
 307                             if (np->n_direofoffset
 308                                 && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
 309                                     return (0);
 310                             bp = nfs_getcacheblk(vp, (off_t)i * NFS_DIRBLKSIZ,
 311                                                  NFS_DIRBLKSIZ, td);
 312                             if (!bp)
 313                                 return (EINTR);
 314                             if ((bp->b_flags & B_CACHE) == 0) {
 315                                     bp->b_cmd = BUF_CMD_READ;
 316                                     bp->b_bio2.bio_done = nfsiodone_sync;
 317                                     bp->b_bio2.bio_flags |= BIO_SYNC;
 318                                     vfs_busy_pages(vp, bp);
 319                                     error = nfs_doio(vp, &bp->b_bio2, td);
 320                                     /*
 321                                      * no error + B_INVAL == directory EOF,
 322                                      * use the block.
 323                                      */
 324                                     if (error == 0 && (bp->b_flags & B_INVAL))
 325                                             break;
 326                             }
 327                             /*
 328                              * An error will throw away the block and the
 329                              * for loop will break out.  If no error and this
 330                              * is not the block we want, we throw away the
 331                              * block and go for the next one via the for loop.
 332                              */
 333                             if (error || i < lbn)
 334                                     brelse(bp);
 335                         }
 336                     }
 337                     /*
 338                      * The above while is repeated if we hit another cookie
 339                      * error.  If we hit an error and it wasn't a cookie error,
 340                      * we give up.
 341                      */
 342                     if (error)
 343                             return (error);
 344                 }
 345
 346                 /*
 347                  * If not eof and read aheads are enabled, start one.
 348                  * (You need the current block first, so that you have the
 349                  *  directory offset cookie of the next block.)
 350                  */
 351                 if (nmp->nm_readahead > 0 && nfs_asyncok(nmp) &&
 352                     (bp->b_flags & B_INVAL) == 0 &&
 353                     (np->n_direofoffset == 0 ||
 354                     loffset + NFS_DIRBLKSIZ < np->n_direofoffset) &&
 355                     findblk(vp, loffset + NFS_DIRBLKSIZ, FINDBLK_TEST) == NULL
 356                 ) {
 357                         rabp = nfs_getcacheblk(vp, loffset + NFS_DIRBLKSIZ,
 358                                                NFS_DIRBLKSIZ, td);
 359                         if (rabp) {
 360                             if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
 361                                 rabp->b_cmd = BUF_CMD_READ;
 362                                 vfs_busy_pages(vp, rabp);
 363                                 nfs_asyncio(vp, &rabp->b_bio2);
 364                             } else {
 365                                 brelse(rabp);
 366                             }
 367                         }
 368                 }
 369                 /*
 370                  * Unlike VREG files, whos buffer size ( bp->b_bcount ) is
 371                  * chopped for the EOF condition, we cannot tell how large
 372                  * NFS directories are going to be until we hit EOF.  So
 373                  * an NFS directory buffer is *not* chopped to its EOF.  Now,
 374                  * it just so happens that b_resid will effectively chop it
 375                  * to EOF.  *BUT* this information is lost if the buffer goes
 376                  * away and is reconstituted into a B_CACHE state ( due to
 377                  * being VMIO ) later.  So we keep track of the directory eof
 378                  * in np->n_direofoffset and chop it off as an extra step
 379                  * right here.
 380                  *
 381                  * NOTE: boff could already be beyond EOF.
 382                  */
 383                 if ((size_t)boff > NFS_DIRBLKSIZ - bp->b_resid) {
 384                         n = 0;
 385                 } else {
 386                         n = szmin(uio->uio_resid,
 387                                   NFS_DIRBLKSIZ - bp->b_resid - (size_t)boff);
 388                 }
 389                 if (np->n_direofoffset &&
 390                     n > (size_t)(np->n_direofoffset - uio->uio_offset)) {
 391                         n = (size_t)(np->n_direofoffset - uio->uio_offset);
 392                 }
 393                 break;
 394             default:
 395                 kprintf(" nfs_bioread: type %x unexpected\n",vp->v_type);
 396                 n = 0;
 397                 break;
 398             };
 399
 400             switch (vp->v_type) {
 401             case VREG:
 402                 if (n > 0)
 403                     error = uiomove(bp->b_data + boff, n, uio);
 404                 break;
 405             case VLNK:
 406                 if (n > 0)
 407                     error = uiomove(bp->b_data + boff, n, uio);
 408                 n = 0;
 409                 break;
 410             case VDIR:
 411                 if (n > 0) {
 412                     off_t old_off = uio->uio_offset;
 413                     caddr_t cpos, epos;
 414                     struct nfs_dirent *dp;
 415
 416                     /*
 417                      * We are casting cpos to nfs_dirent, it must be
 418                      * int-aligned.
 419                      */
 420                     if (boff & 3) {
 421                         error = EINVAL;
 422                         break;
 423                     }
 424
 425                     cpos = bp->b_data + boff;
 426                     epos = bp->b_data + boff + n;
 427                     while (cpos < epos && error == 0 && uio->uio_resid > 0) {
 428                             dp = (struct nfs_dirent *)cpos;
 429                             error = nfs_check_dirent(dp, (int)(epos - cpos));
 430                             if (error)
 431                                     break;
 432                             if (vop_write_dirent(&error, uio, dp->nfs_ino,
 433                                 dp->nfs_type, dp->nfs_namlen, dp->nfs_name)) {
 434                                     break;
 435                             }
 436                             cpos += dp->nfs_reclen;
 437                     }
 438                     n = 0;
 439                     if (error == 0) {
 440                             uio->uio_offset = old_off + cpos -
 441                                               bp->b_data - boff;
 442                     }
 443                 }
 444                 break;
 445             default:
 446                 kprintf(" nfs_bioread: type %x unexpected\n",vp->v_type);
 447             }
 448             if (bp)
 449                     brelse(bp);
 450         } while (error == 0 && uio->uio_resid > 0 && n > 0);
 451         return (error);
 452 }
 453
 454 /*
 455  * Userland can supply any 'seek' offset when reading a NFS directory.
 456  * Validate the structure so we don't panic the kernel.  Note that
 457  * the element name is nul terminated and the nul is not included
 458  * in nfs_namlen.
 459  */
 460 static
 461 int
 462 nfs_check_dirent(struct nfs_dirent *dp, int maxlen)
 463 {
 464         int nfs_name_off = offsetof(struct nfs_dirent, nfs_name[0]);
 465
 466         if (nfs_name_off >= maxlen)
 467                 return (EINVAL);
 468         if (dp->nfs_reclen < nfs_name_off || dp->nfs_reclen > maxlen)
 469                 return (EINVAL);
 470         if (nfs_name_off + dp->nfs_namlen >= dp->nfs_reclen)
 471                 return (EINVAL);
 472         if (dp->nfs_reclen & 3)
 473                 return (EINVAL);
 474         return (0);
 475 }
 476
 477 /*
 478  * Vnode op for write using bio
 479  *
 480  * nfs_write(struct vnode *a_vp, struct uio *a_uio, int a_ioflag,
 481  *           struct ucred *a_cred)
 482  */
 483 int
 484 nfs_write(struct vop_write_args *ap)
 485 {
 486         struct uio *uio = ap->a_uio;
 487         struct thread *td = uio->uio_td;
 488         struct vnode *vp = ap->a_vp;
 489         struct nfsnode *np = VTONFS(vp);
 490         int ioflag = ap->a_ioflag;
 491         struct buf *bp;
 492         struct vattr vattr;
 493         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 494         off_t loffset;
 495         int boff, bytes;
 496         int error = 0;
 497         int haverslock = 0;
 498         int bcount;
 499         int biosize;
 500         int trivial;
 501
 502 #ifdef DIAGNOSTIC
 503         if (uio->uio_rw != UIO_WRITE)
 504                 panic("nfs_write mode");
 505         if (uio->uio_segflg == UIO_USERSPACE && uio->uio_td != curthread)
 506                 panic("nfs_write proc");
 507 #endif
 508         if (vp->v_type != VREG)
 509                 return (EIO);
 510         if (np->n_flag & NWRITEERR) {
 511                 np->n_flag &= ~NWRITEERR;
 512                 return (np->n_error);
 513         }
 514         if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
 515             (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
 516                 (void)nfs_fsinfo(nmp, vp, td);
 517
 518         /*
 519          * Synchronously flush pending buffers if we are in synchronous
 520          * mode or if we are appending.
 521          */
 522         if (ioflag & (IO_APPEND | IO_SYNC)) {
 523                 if (np->n_flag & NLMODIFIED) {
 524                         np->n_attrstamp = 0;
 525                         error = nfs_flush(vp, MNT_WAIT, td, 0);
 526                         /* error = nfs_vinvalbuf(vp, V_SAVE, 1); */
 527                         if (error)
 528                                 return (error);
 529                 }
 530         }
 531
 532         /*
 533          * If IO_APPEND then load uio_offset.  We restart here if we cannot
 534          * get the append lock.
 535          */
 536 restart:
 537         if (ioflag & IO_APPEND) {
 538                 np->n_attrstamp = 0;
 539                 error = VOP_GETATTR(vp, &vattr);
 540                 if (error)
 541                         return (error);
 542                 uio->uio_offset = np->n_size;
 543         }
 544
 545         if (uio->uio_offset < 0)
 546                 return (EINVAL);
 547         if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
 548                 return (EFBIG);
 549         if (uio->uio_resid == 0)
 550                 return (0);
 551
 552         /*
 553          * We need to obtain the rslock if we intend to modify np->n_size
 554          * in order to guarentee the append point with multiple contending
 555          * writers, to guarentee that no other appenders modify n_size
 556          * while we are trying to obtain a truncated buffer (i.e. to avoid
 557          * accidently truncating data written by another appender due to
 558          * the race), and to ensure that the buffer is populated prior to
 559          * our extending of the file.  We hold rslock through the entire
 560          * operation.
 561          *
 562          * Note that we do not synchronize the case where someone truncates
 563          * the file while we are appending to it because attempting to lock
 564          * this case may deadlock other parts of the system unexpectedly.
 565          */
 566         if ((ioflag & IO_APPEND) ||
 567             uio->uio_offset + uio->uio_resid > np->n_size) {
 568                 switch(nfs_rslock(np)) {
 569                 case ENOLCK:
 570                         goto restart;
 571                         /* not reached */
 572                 case EINTR:
 573                 case ERESTART:
 574                         return(EINTR);
 575                         /* not reached */
 576                 default:
 577                         break;
 578                 }
 579                 haverslock = 1;
 580         }
 581
 582         /*
 583          * Maybe this should be above the vnode op call, but so long as
 584          * file servers have no limits, i don't think it matters
 585          */
 586         if (td && td->td_proc && uio->uio_offset + uio->uio_resid >
 587               td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
 588                 lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
 589                 if (haverslock)
 590                         nfs_rsunlock(np);
 591                 return (EFBIG);
 592         }
 593
 594         biosize = vp->v_mount->mnt_stat.f_iosize;
 595
 596         do {
 597                 nfsstats.biocache_writes++;
 598                 boff = uio->uio_offset & (biosize-1);
 599                 loffset = uio->uio_offset - boff;
 600                 bytes = (int)szmin((unsigned)(biosize - boff), uio->uio_resid);
 601 again:
 602                 /*
 603                  * Handle direct append and file extension cases, calculate
 604                  * unaligned buffer size.  When extending B_CACHE will be
 605                  * set if possible.  See UIO_NOCOPY note below.
 606                  */
 607                 if (uio->uio_offset + bytes > np->n_size) {
 608                         np->n_flag |= NLMODIFIED;
 609                         trivial = (uio->uio_segflg != UIO_NOCOPY &&
 610                                    uio->uio_offset <= np->n_size);
 611                         nfs_meta_setsize(vp, td, uio->uio_offset + bytes,
 612                                          trivial);
 613                 }
 614                 bp = nfs_getcacheblk(vp, loffset, biosize, td);
 615                 if (bp == NULL) {
 616                         error = EINTR;
 617                         break;
 618                 }
 619
 620                 /*
 621                  * Actual bytes in buffer which we care about
 622                  */
 623                 if (loffset + biosize < np->n_size)
 624                         bcount = biosize;
 625                 else
 626                         bcount = (int)(np->n_size - loffset);
 627
 628                 /*
 629                  * Avoid a read by setting B_CACHE where the data we
 630                  * intend to write covers the entire buffer.  Note
 631                  * that the buffer may have been set to B_CACHE by
 632                  * nfs_meta_setsize() above or otherwise inherited the
 633                  * flag, but if B_CACHE isn't set the buffer may be
 634                  * uninitialized and must be zero'd to accomodate
 635                  * future seek+write's.
 636                  *
 637                  * See the comments in kern/vfs_bio.c's getblk() for
 638                  * more information.
 639                  *
 640                  * When doing a UIO_NOCOPY write the buffer is not
 641                  * overwritten and we cannot just set B_CACHE unconditionally
 642                  * for full-block writes.
 643                  */
 644                 if (boff == 0 && bytes == biosize &&
 645                     uio->uio_segflg != UIO_NOCOPY) {
 646                         bp->b_flags |= B_CACHE;
 647                         bp->b_flags &= ~(B_ERROR | B_INVAL);
 648                 }
 649
 650                 /*
 651                  * b_resid may be set due to file EOF if we extended out.
 652                  * The NFS bio code will zero the difference anyway so
 653                  * just acknowledged the fact and set b_resid to 0.
 654                  */
 655                 if ((bp->b_flags & B_CACHE) == 0) {
 656                         bp->b_cmd = BUF_CMD_READ;
 657                         bp->b_bio2.bio_done = nfsiodone_sync;
 658                         bp->b_bio2.bio_flags |= BIO_SYNC;
 659                         vfs_busy_pages(vp, bp);
 660                         error = nfs_doio(vp, &bp->b_bio2, td);
 661                         if (error) {
 662                                 brelse(bp);
 663                                 break;
 664                         }
 665                         bp->b_resid = 0;
 666                 }
 667                 np->n_flag |= NLMODIFIED;
 668
 669                 /*
 670                  * If dirtyend exceeds file size, chop it down.  This should
 671                  * not normally occur but there is an append race where it
 672                  * might occur XXX, so we log it.
 673                  *
 674                  * If the chopping creates a reverse-indexed or degenerate
 675                  * situation with dirtyoff/end, we 0 both of them.
 676                  */
 677                 if (bp->b_dirtyend > bcount) {
 678                         kprintf("NFS append race @%08llx:%d\n",
 679                             (long long)bp->b_bio2.bio_offset,
 680                             bp->b_dirtyend - bcount);
 681                         bp->b_dirtyend = bcount;
 682                 }
 683
 684                 if (bp->b_dirtyoff >= bp->b_dirtyend)
 685                         bp->b_dirtyoff = bp->b_dirtyend = 0;
 686
 687                 /*
 688                  * If the new write will leave a contiguous dirty
 689                  * area, just update the b_dirtyoff and b_dirtyend,
 690                  * otherwise force a write rpc of the old dirty area.
 691                  *
 692                  * While it is possible to merge discontiguous writes due to
 693                  * our having a B_CACHE buffer ( and thus valid read data
 694                  * for the hole), we don't because it could lead to
 695                  * significant cache coherency problems with multiple clients,
 696                  * especially if locking is implemented later on.
 697                  *
 698                  * as an optimization we could theoretically maintain
 699                  * a linked list of discontinuous areas, but we would still
 700                  * have to commit them separately so there isn't much
 701                  * advantage to it except perhaps a bit of asynchronization.
 702                  */
 703                 if (bp->b_dirtyend > 0 &&
 704                     (boff > bp->b_dirtyend ||
 705                      (boff + bytes) < bp->b_dirtyoff)
 706                 ) {
 707                         if (bwrite(bp) == EINTR) {
 708                                 error = EINTR;
 709                                 break;
 710                         }
 711                         goto again;
 712                 }
 713
 714                 error = uiomove(bp->b_data + boff, bytes, uio);
 715
 716                 /*
 717                  * Since this block is being modified, it must be written
 718                  * again and not just committed.  Since write clustering does
 719                  * not work for the stage 1 data write, only the stage 2
 720                  * commit rpc, we have to clear B_CLUSTEROK as well.
 721                  */
 722                 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 723
 724                 if (error) {
 725                         brelse(bp);
 726                         break;
 727                 }
 728
 729                 /*
 730                  * Only update dirtyoff/dirtyend if not a degenerate
 731                  * condition.
 732                  *
 733                  * The underlying VM pages have been marked valid by
 734                  * virtue of acquiring the bp.  Because the entire buffer
 735                  * is marked dirty we do not have to worry about cleaning
 736                  * out the related dirty bits (and wouldn't really know
 737                  * how to deal with byte ranges anyway)
 738                  */
 739                 if (bytes) {
 740                         if (bp->b_dirtyend > 0) {
 741                                 bp->b_dirtyoff = imin(boff, bp->b_dirtyoff);
 742                                 bp->b_dirtyend = imax(boff + bytes,
 743                                                       bp->b_dirtyend);
 744                         } else {
 745                                 bp->b_dirtyoff = boff;
 746                                 bp->b_dirtyend = boff + bytes;
 747                         }
 748                 }
 749
 750                 /*
 751                  * If the lease is non-cachable or IO_SYNC do bwrite().
 752                  *
 753                  * IO_INVAL appears to be unused.  The idea appears to be
 754                  * to turn off caching in this case.  Very odd.  XXX
 755                  *
 756                  * If nfs_async is set bawrite() will use an unstable write
 757                  * (build dirty bufs on the server), so we might as well
 758                  * push it out with bawrite().  If nfs_async is not set we
 759                  * use bdwrite() to cache dirty bufs on the client.
 760                  */
 761                 if (ioflag & IO_SYNC) {
 762                         if (ioflag & IO_INVAL)
 763                                 bp->b_flags |= B_NOCACHE;
 764                         error = bwrite(bp);
 765                         if (error)
 766                                 break;
 767                 } else if (boff + bytes == biosize && nfs_async) {
 768                         bawrite(bp);
 769                 } else {
 770                         bdwrite(bp);
 771                 }
 772         } while (uio->uio_resid > 0 && bytes > 0);
 773
 774         if (haverslock)
 775                 nfs_rsunlock(np);
 776
 777         return (error);
 778 }
 779
 780 /*
 781  * Get an nfs cache block.
 782  *
 783  * Allocate a new one if the block isn't currently in the cache
 784  * and return the block marked busy. If the calling process is
 785  * interrupted by a signal for an interruptible mount point, return
 786  * NULL.
 787  *
 788  * The caller must carefully deal with the possible B_INVAL state of
 789  * the buffer.  nfs_startio() clears B_INVAL (and nfs_asyncio() clears it
 790  * indirectly), so synchronous reads can be issued without worrying about
 791  * the B_INVAL state.  We have to be a little more careful when dealing
 792  * with writes (see comments in nfs_write()) when extending a file past
 793  * its EOF.
 794  */
 795 static struct buf *
 796 nfs_getcacheblk(struct vnode *vp, off_t loffset, int size, struct thread *td)
 797 {
 798         struct buf *bp;
 799         struct mount *mp;
 800         struct nfsmount *nmp;
 801
 802         mp = vp->v_mount;
 803         nmp = VFSTONFS(mp);
 804
 805         if (nmp->nm_flag & NFSMNT_INT) {
 806                 bp = getblk(vp, loffset, size, GETBLK_PCATCH, 0);
 807                 while (bp == NULL) {
 808                         if (nfs_sigintr(nmp, NULL, td))
 809                                 return (NULL);
 810                         bp = getblk(vp, loffset, size, 0, 2 * hz);
 811                 }
 812         } else {
 813                 bp = getblk(vp, loffset, size, 0, 0);
 814         }
 815
 816         /*
 817          * bio2, the 'device' layer.  Since BIOs use 64 bit byte offsets
 818          * now, no translation is necessary.
 819          */
 820         bp->b_bio2.bio_offset = loffset;
 821         return (bp);
 822 }
 823
 824 /*
 825  * Flush and invalidate all dirty buffers. If another process is already
 826  * doing the flush, just wait for completion.
 827  */
 828 int
 829 nfs_vinvalbuf(struct vnode *vp, int flags, int intrflg)
 830 {
 831         struct nfsnode *np = VTONFS(vp);
 832         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 833         int error = 0, slpflag, slptimeo;
 834         thread_t td = curthread;
 835
 836         if (vp->v_flag & VRECLAIMED)
 837                 return (0);
 838
 839         if ((nmp->nm_flag & NFSMNT_INT) == 0)
 840                 intrflg = 0;
 841         if (intrflg) {
 842                 slpflag = PCATCH;
 843                 slptimeo = 2 * hz;
 844         } else {
 845                 slpflag = 0;
 846                 slptimeo = 0;
 847         }
 848         /*
 849          * First wait for any other process doing a flush to complete.
 850          */
 851         while (np->n_flag & NFLUSHINPROG) {
 852                 np->n_flag |= NFLUSHWANT;
 853                 error = tsleep((caddr_t)&np->n_flag, 0, "nfsvinval", slptimeo);
 854                 if (error && intrflg && nfs_sigintr(nmp, NULL, td))
 855                         return (EINTR);
 856         }
 857
 858         /*
 859          * Now, flush as required.
 860          */
 861         np->n_flag |= NFLUSHINPROG;
 862         error = vinvalbuf(vp, flags, slpflag, 0);
 863         while (error) {
 864                 if (intrflg && nfs_sigintr(nmp, NULL, td)) {
 865                         np->n_flag &= ~NFLUSHINPROG;
 866                         if (np->n_flag & NFLUSHWANT) {
 867                                 np->n_flag &= ~NFLUSHWANT;
 868                                 wakeup((caddr_t)&np->n_flag);
 869                         }
 870                         return (EINTR);
 871                 }
 872                 error = vinvalbuf(vp, flags, 0, slptimeo);
 873         }
 874         np->n_flag &= ~(NLMODIFIED | NFLUSHINPROG);
 875         if (np->n_flag & NFLUSHWANT) {
 876                 np->n_flag &= ~NFLUSHWANT;
 877                 wakeup((caddr_t)&np->n_flag);
 878         }
 879         return (0);
 880 }
 881
 882 /*
 883  * Return true (non-zero) if the txthread and rxthread are operational
 884  * and we do not already have too many not-yet-started BIO's built up.
 885  */
 886 int
 887 nfs_asyncok(struct nfsmount *nmp)
 888 {
 889         return (nmp->nm_bioqlen < nfs_maxasyncbio &&
 890                 nmp->nm_bioqlen < nmp->nm_maxasync_scaled / NFS_ASYSCALE &&
 891                 nmp->nm_rxstate <= NFSSVC_PENDING &&
 892                 nmp->nm_txstate <= NFSSVC_PENDING);
 893 }
 894
 895 /*
 896  * The read-ahead code calls this to queue a bio to the txthread.
 897  *
 898  * We don't touch the bio otherwise... that is, we do not even
 899  * construct or send the initial rpc.  The txthread will do it
 900  * for us.
 901  *
 902  * NOTE!  nm_bioqlen is not decremented until the request completes,
 903  *        so it does not reflect the number of bio's on bioq.
 904  */
 905 void
 906 nfs_asyncio(struct vnode *vp, struct bio *bio)
 907 {
 908         struct buf *bp = bio->bio_buf;
 909         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 910
 911         KKASSERT(vp->v_tag == VT_NFS);
 912         BUF_KERNPROC(bp);
 913         bio->bio_driver_info = vp;
 914         crit_enter();
 915         TAILQ_INSERT_TAIL(&nmp->nm_bioq, bio, bio_act);
 916         atomic_add_int(&nmp->nm_bioqlen, 1);
 917         crit_exit();
 918         nfssvc_iod_writer_wakeup(nmp);
 919 }
 920
 921 /*
 922  * nfs_dio()    - Execute a BIO operation synchronously.  The BIO will be
 923  *                completed and its error returned.  The caller is responsible
 924  *                for brelse()ing it.  ONLY USE FOR BIO_SYNC IOs!  Otherwise
 925  *                our error probe will be against an invalid pointer.
 926  *
 927  * nfs_startio()- Execute a BIO operation assynchronously.
 928  *
 929  * NOTE: nfs_asyncio() is used to initiate an asynchronous BIO operation,
 930  *       which basically just queues it to the txthread.  nfs_startio()
 931  *       actually initiates the I/O AFTER it has gotten to the txthread.
 932  *
 933  * NOTE: td might be NULL.
 934  *
 935  * NOTE: Caller has already busied the I/O.
 936  */
 937 void
 938 nfs_startio(struct vnode *vp, struct bio *bio, struct thread *td)
 939 {
 940         struct buf *bp = bio->bio_buf;
 941         struct nfsnode *np;
 942         struct nfsmount *nmp;
 943
 944         KKASSERT(vp->v_tag == VT_NFS);
 945         np = VTONFS(vp);
 946         nmp = VFSTONFS(vp->v_mount);
 947
 948         /*
 949          * clear B_ERROR and B_INVAL state prior to initiating the I/O.  We
 950          * do this here so we do not have to do it in all the code that
 951          * calls us.
 952          */
 953         bp->b_flags &= ~(B_ERROR | B_INVAL);
 954
 955         KASSERT(bp->b_cmd != BUF_CMD_DONE,
 956                 ("nfs_doio: bp %p already marked done!", bp));
 957
 958         if (bp->b_cmd == BUF_CMD_READ) {
 959             switch (vp->v_type) {
 960             case VREG:
 961                 nfsstats.read_bios++;
 962                 nfs_readrpc_bio(vp, bio);
 963                 break;
 964             case VLNK:
 965 #if 0
 966                 bio->bio_offset = 0;
 967                 nfsstats.readlink_bios++;
 968                 nfs_readlinkrpc_bio(vp, bio);
 969 #else
 970                 nfs_doio(vp, bio, td);
 971 #endif
 972                 break;
 973             case VDIR:
 974                 /*
 975                  * NOTE: If nfs_readdirplusrpc_bio() is requested but
 976                  *       not supported, it will chain to
 977                  *       nfs_readdirrpc_bio().
 978                  */
 979 #if 0
 980                 nfsstats.readdir_bios++;
 981                 uiop->uio_offset = bio->bio_offset;
 982                 if (nmp->nm_flag & NFSMNT_RDIRPLUS)
 983                         nfs_readdirplusrpc_bio(vp, bio);
 984                 else
 985                         nfs_readdirrpc_bio(vp, bio);
 986 #else
 987                 nfs_doio(vp, bio, td);
 988 #endif
 989                 break;
 990             default:
 991                 kprintf("nfs_doio:  type %x unexpected\n",vp->v_type);
 992                 bp->b_flags |= B_ERROR;
 993                 bp->b_error = EINVAL;
 994                 biodone(bio);
 995                 break;
 996             }
 997         } else {
 998             /*
 999              * If we only need to commit, try to commit.  If this fails
1000              * it will chain through to the write.  Basically all the logic
1001              * in nfs_doio() is replicated.
1002              */
1003             KKASSERT(bp->b_cmd == BUF_CMD_WRITE);
1004             if (bp->b_flags & B_NEEDCOMMIT)
1005                 nfs_commitrpc_bio(vp, bio);
1006             else
1007                 nfs_writerpc_bio(vp, bio);
1008         }
1009 }
1010
1011 int
1012 nfs_doio(struct vnode *vp, struct bio *bio, struct thread *td)
1013 {
1014         struct buf *bp = bio->bio_buf;
1015         struct uio *uiop;
1016         struct nfsnode *np;
1017         struct nfsmount *nmp;
1018         int error = 0;
1019         int iomode, must_commit;
1020         size_t n;
1021         struct uio uio;
1022         struct iovec io;
1023
1024         KKASSERT(vp->v_tag == VT_NFS);
1025         np = VTONFS(vp);
1026         nmp = VFSTONFS(vp->v_mount);
1027         uiop = &uio;
1028         uiop->uio_iov = &io;
1029         uiop->uio_iovcnt = 1;
1030         uiop->uio_segflg = UIO_SYSSPACE;
1031         uiop->uio_td = td;
1032
1033         /*
1034          * clear B_ERROR and B_INVAL state prior to initiating the I/O.  We
1035          * do this here so we do not have to do it in all the code that
1036          * calls us.
1037          */
1038         bp->b_flags &= ~(B_ERROR | B_INVAL);
1039
1040         KASSERT(bp->b_cmd != BUF_CMD_DONE,
1041                 ("nfs_doio: bp %p already marked done!", bp));
1042
1043         if (bp->b_cmd == BUF_CMD_READ) {
1044             io.iov_len = uiop->uio_resid = (size_t)bp->b_bcount;
1045             io.iov_base = bp->b_data;
1046             uiop->uio_rw = UIO_READ;
1047
1048             switch (vp->v_type) {
1049             case VREG:
1050                 /*
1051                  * When reading from a regular file zero-fill any residual.
1052                  * Note that this residual has nothing to do with NFS short
1053                  * reads, which nfs_readrpc_uio() will handle for us.
1054                  *
1055                  * We have to do this because when we are write extending
1056                  * a file the server may not have the same notion of
1057                  * filesize as we do.  Our BIOs should already be sized
1058                  * (b_bcount) to account for the file EOF.
1059                  */
1060                 nfsstats.read_bios++;
1061                 uiop->uio_offset = bio->bio_offset;
1062                 error = nfs_readrpc_uio(vp, uiop);
1063                 if (error == 0 && uiop->uio_resid) {
1064                         n = (size_t)bp->b_bcount - uiop->uio_resid;
1065                         bzero(bp->b_data + n, bp->b_bcount - n);
1066                         uiop->uio_resid = 0;
1067                 }
1068                 if (td && td->td_proc && (vp->v_flag & VTEXT) &&
1069                     np->n_mtime != np->n_vattr.va_mtime.tv_sec) {
1070                         uprintf("Process killed due to text file modification\n");
1071                         ksignal(td->td_proc, SIGKILL);
1072                 }
1073                 break;
1074             case VLNK:
1075                 uiop->uio_offset = 0;
1076                 nfsstats.readlink_bios++;
1077                 error = nfs_readlinkrpc_uio(vp, uiop);
1078                 break;
1079             case VDIR:
1080                 nfsstats.readdir_bios++;
1081                 uiop->uio_offset = bio->bio_offset;
1082                 if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
1083                         error = nfs_readdirplusrpc_uio(vp, uiop);
1084                         if (error == NFSERR_NOTSUPP)
1085                                 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
1086                 }
1087                 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
1088                         error = nfs_readdirrpc_uio(vp, uiop);
1089                 /*
1090                  * end-of-directory sets B_INVAL but does not generate an
1091                  * error.
1092                  */
1093                 if (error == 0 && uiop->uio_resid == bp->b_bcount)
1094                         bp->b_flags |= B_INVAL;
1095                 break;
1096             default:
1097                 kprintf("nfs_doio:  type %x unexpected\n",vp->v_type);
1098                 break;
1099             };
1100             if (error) {
1101                 bp->b_flags |= B_ERROR;
1102                 bp->b_error = error;
1103             }
1104             bp->b_resid = uiop->uio_resid;
1105         } else {
1106             /*
1107              * If we only need to commit, try to commit.
1108              *
1109              * NOTE: The I/O has already been staged for the write and
1110              *       its pages busied, so b_dirtyoff/end is valid.
1111              */
1112             KKASSERT(bp->b_cmd == BUF_CMD_WRITE);
1113             if (bp->b_flags & B_NEEDCOMMIT) {
1114                     int retv;
1115                     off_t off;
1116
1117                     off = bio->bio_offset + bp->b_dirtyoff;
1118                     retv = nfs_commitrpc_uio(vp, off,
1119                                              bp->b_dirtyend - bp->b_dirtyoff,
1120                                              td);
1121                     if (retv == 0) {
1122                             bp->b_dirtyoff = bp->b_dirtyend = 0;
1123                             bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1124                             bp->b_resid = 0;
1125                             biodone(bio);
1126                             return(0);
1127                     }
1128                     if (retv == NFSERR_STALEWRITEVERF) {
1129                             nfs_clearcommit(vp->v_mount);
1130                     }
1131             }
1132
1133             /*
1134              * Setup for actual write
1135              */
1136             if (bio->bio_offset + bp->b_dirtyend > np->n_size)
1137                 bp->b_dirtyend = np->n_size - bio->bio_offset;
1138
1139             if (bp->b_dirtyend > bp->b_dirtyoff) {
1140                 io.iov_len = uiop->uio_resid = bp->b_dirtyend
1141                     - bp->b_dirtyoff;
1142                 uiop->uio_offset = bio->bio_offset + bp->b_dirtyoff;
1143                 io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
1144                 uiop->uio_rw = UIO_WRITE;
1145                 nfsstats.write_bios++;
1146
1147                 if ((bp->b_flags & (B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == 0)
1148                     iomode = NFSV3WRITE_UNSTABLE;
1149                 else
1150                     iomode = NFSV3WRITE_FILESYNC;
1151
1152                 must_commit = 0;
1153                 error = nfs_writerpc_uio(vp, uiop, &iomode, &must_commit);
1154
1155                 /*
1156                  * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
1157                  * to cluster the buffers needing commit.  This will allow
1158                  * the system to submit a single commit rpc for the whole
1159                  * cluster.  We can do this even if the buffer is not 100%
1160                  * dirty (relative to the NFS blocksize), so we optimize the
1161                  * append-to-file-case.
1162                  *
1163                  * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
1164                  * cleared because write clustering only works for commit
1165                  * rpc's, not for the data portion of the write).
1166                  */
1167
1168                 if (!error && iomode == NFSV3WRITE_UNSTABLE) {
1169                     bp->b_flags |= B_NEEDCOMMIT;
1170                     if (bp->b_dirtyoff == 0
1171                         && bp->b_dirtyend == bp->b_bcount)
1172                         bp->b_flags |= B_CLUSTEROK;
1173                 } else {
1174                     bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1175                 }
1176
1177                 /*
1178                  * For an interrupted write, the buffer is still valid
1179                  * and the write hasn't been pushed to the server yet,
1180                  * so we can't set B_ERROR and report the interruption
1181                  * by setting B_EINTR. For the async case, B_EINTR
1182                  * is not relevant, so the rpc attempt is essentially
1183                  * a noop.  For the case of a V3 write rpc not being
1184                  * committed to stable storage, the block is still
1185                  * dirty and requires either a commit rpc or another
1186                  * write rpc with iomode == NFSV3WRITE_FILESYNC before
1187                  * the block is reused. This is indicated by setting
1188                  * the B_DELWRI and B_NEEDCOMMIT flags.
1189                  *
1190                  * If the buffer is marked B_PAGING, it does not reside on
1191                  * the vp's paging queues so we cannot call bdirty().  The
1192                  * bp in this case is not an NFS cache block so we should
1193                  * be safe. XXX
1194                  */
1195                 if (error == EINTR
1196                     || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
1197                         crit_enter();
1198                         bp->b_flags &= ~(B_INVAL|B_NOCACHE);
1199                         if ((bp->b_flags & B_PAGING) == 0)
1200                             bdirty(bp);
1201                         if (error)
1202                             bp->b_flags |= B_EINTR;
1203                         crit_exit();
1204                 } else {
1205                     if (error) {
1206                         bp->b_flags |= B_ERROR;
1207                         bp->b_error = np->n_error = error;
1208                         np->n_flag |= NWRITEERR;
1209                     }
1210                     bp->b_dirtyoff = bp->b_dirtyend = 0;
1211                 }
1212                 if (must_commit)
1213                     nfs_clearcommit(vp->v_mount);
1214                 bp->b_resid = uiop->uio_resid;
1215             } else {
1216                 bp->b_resid = 0;
1217             }
1218         }
1219
1220         /*
1221          * I/O was run synchronously, biodone() it and calculate the
1222          * error to return.
1223          */
1224         biodone(bio);
1225         KKASSERT(bp->b_cmd == BUF_CMD_DONE);
1226         if (bp->b_flags & B_EINTR)
1227                 return (EINTR);
1228         if (bp->b_flags & B_ERROR)
1229                 return (bp->b_error ? bp->b_error : EIO);
1230         return (0);
1231 }
1232
1233 /*
1234  * Handle all truncation, write-extend, and ftruncate()-extend operations
1235  * on the NFS lcient side.
1236  *
1237  * We use the new API in kern/vfs_vm.c to perform these operations in a
1238  * VM-friendly way.  With this API VM pages are properly zerod and pages
1239  * still mapped into the buffer straddling EOF are not invalidated.
1240  */
1241 int
1242 nfs_meta_setsize(struct vnode *vp, struct thread *td, off_t nsize, int trivial)
1243 {
1244         struct nfsnode *np = VTONFS(vp);
1245         off_t osize;
1246         int biosize = vp->v_mount->mnt_stat.f_iosize;
1247         int error;
1248
1249         osize = np->n_size;
1250         np->n_size = nsize;
1251
1252         if (nsize < osize) {
1253                 error = nvtruncbuf(vp, nsize, biosize);
1254         } else {
1255                 error = nvextendbuf(vp, osize, nsize,
1256                                     biosize, biosize, trivial);
1257         }
1258         return(error);
1259 }
1260
1261 /*
1262  * Synchronous completion for nfs_doio.  Call bpdone() with elseit=FALSE.
1263  * Caller is responsible for brelse()'ing the bp.
1264  */
1265 static void
1266 nfsiodone_sync(struct bio *bio)
1267 {
1268         bio->bio_flags = 0;
1269         bpdone(bio->bio_buf, 0);
1270 }
1271
1272 /*
1273  * nfs read rpc - BIO version
1274  */
1275 void
1276 nfs_readrpc_bio(struct vnode *vp, struct bio *bio)
1277 {
1278         struct buf *bp = bio->bio_buf;
1279         u_int32_t *tl;
1280         struct nfsmount *nmp;
1281         int error = 0, len, tsiz;
1282         struct nfsm_info *info;
1283
1284         info = kmalloc(sizeof(*info), M_NFSREQ, M_WAITOK);
1285         info->mrep = NULL;
1286         info->v3 = NFS_ISV3(vp);
1287
1288         nmp = VFSTONFS(vp->v_mount);
1289         tsiz = bp->b_bcount;
1290         KKASSERT(tsiz <= nmp->nm_rsize);
1291         if (bio->bio_offset + tsiz > nmp->nm_maxfilesize) {
1292                 error = EFBIG;
1293                 goto nfsmout;
1294         }
1295         nfsstats.rpccnt[NFSPROC_READ]++;
1296         len = tsiz;
1297         nfsm_reqhead(info, vp, NFSPROC_READ,
1298                      NFSX_FH(info->v3) + NFSX_UNSIGNED * 3);
1299         ERROROUT(nfsm_fhtom(info, vp));
1300         tl = nfsm_build(info, NFSX_UNSIGNED * 3);
1301         if (info->v3) {
1302                 txdr_hyper(bio->bio_offset, tl);
1303                 *(tl + 2) = txdr_unsigned(len);
1304         } else {
1305                 *tl++ = txdr_unsigned(bio->bio_offset);
1306                 *tl++ = txdr_unsigned(len);
1307                 *tl = 0;
1308         }
1309         info->bio = bio;
1310         info->done = nfs_readrpc_bio_done;
1311         nfsm_request_bio(info, vp, NFSPROC_READ, NULL,
1312                          nfs_vpcred(vp, ND_READ));
1313         return;
1314 nfsmout:
1315         kfree(info, M_NFSREQ);
1316         bp->b_error = error;
1317         bp->b_flags |= B_ERROR;
1318         biodone(bio);
1319 }
1320
1321 static void
1322 nfs_readrpc_bio_done(nfsm_info_t info)
1323 {
1324         struct nfsmount *nmp = VFSTONFS(info->vp->v_mount);
1325         struct bio *bio = info->bio;
1326         struct buf *bp = bio->bio_buf;
1327         u_int32_t *tl;
1328         int attrflag;
1329         int retlen;
1330         int eof;
1331         int error = 0;
1332
1333         KKASSERT(info->state == NFSM_STATE_DONE);
1334
1335         if (info->v3) {
1336                 ERROROUT(nfsm_postop_attr(info, info->vp, &attrflag,
1337                                          NFS_LATTR_NOSHRINK));
1338                 NULLOUT(tl = nfsm_dissect(info, 2 * NFSX_UNSIGNED));
1339                 eof = fxdr_unsigned(int, *(tl + 1));
1340         } else {
1341                 ERROROUT(nfsm_loadattr(info, info->vp, NULL));
1342                 eof = 0;
1343         }
1344         NEGATIVEOUT(retlen = nfsm_strsiz(info, nmp->nm_rsize));
1345         ERROROUT(nfsm_mtobio(info, bio, retlen));
1346         m_freem(info->mrep);
1347         info->mrep = NULL;
1348
1349         /*
1350          * No error occured, if retlen is less then bcount and no EOF
1351          * and NFSv3 a zero-fill short read occured.
1352          *
1353          * For NFSv2 a short-read indicates EOF.
1354          */
1355         if (retlen < bp->b_bcount && info->v3 && eof == 0) {
1356                 bzero(bp->b_data + retlen, bp->b_bcount - retlen);
1357                 retlen = bp->b_bcount;
1358         }
1359
1360         /*
1361          * If we hit an EOF we still zero-fill, but return the expected
1362          * b_resid anyway.  This should normally not occur since async
1363          * BIOs are not used for read-before-write case.  Races against
1364          * the server can cause it though and we don't want to leave
1365          * garbage in the buffer.
1366          */
1367         if (retlen < bp->b_bcount) {
1368                 bzero(bp->b_data + retlen, bp->b_bcount - retlen);
1369         }
1370         bp->b_resid = 0;
1371         /* bp->b_resid = bp->b_bcount - retlen; */
1372 nfsmout:
1373         kfree(info, M_NFSREQ);
1374         if (error) {
1375                 bp->b_error = error;
1376                 bp->b_flags |= B_ERROR;
1377         }
1378         biodone(bio);
1379 }
1380
1381 /*
1382  * nfs write call - BIO version
1383  *
1384  * NOTE: Caller has already busied the I/O.
1385  */
1386 void
1387 nfs_writerpc_bio(struct vnode *vp, struct bio *bio)
1388 {
1389         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
1390         struct nfsnode *np = VTONFS(vp);
1391         struct buf *bp = bio->bio_buf;
1392         u_int32_t *tl;
1393         int len;
1394         int iomode;
1395         int error = 0;
1396         struct nfsm_info *info;
1397         off_t offset;
1398
1399         /*
1400          * Setup for actual write.  Just clean up the bio if there
1401          * is nothing to do.  b_dirtyoff/end have already been staged
1402          * by the bp's pages getting busied.
1403          */
1404         if (bio->bio_offset + bp->b_dirtyend > np->n_size)
1405                 bp->b_dirtyend = np->n_size - bio->bio_offset;
1406
1407         if (bp->b_dirtyend <= bp->b_dirtyoff) {
1408                 bp->b_resid = 0;
1409                 biodone(bio);
1410                 return;
1411         }
1412         len = bp->b_dirtyend - bp->b_dirtyoff;
1413         offset = bio->bio_offset + bp->b_dirtyoff;
1414         if (offset + len > nmp->nm_maxfilesize) {
1415                 bp->b_flags |= B_ERROR;
1416                 bp->b_error = EFBIG;
1417                 biodone(bio);
1418                 return;
1419         }
1420         bp->b_resid = len;
1421         nfsstats.write_bios++;
1422
1423         info = kmalloc(sizeof(*info), M_NFSREQ, M_WAITOK);
1424         info->mrep = NULL;
1425         info->v3 = NFS_ISV3(vp);
1426         info->info_writerpc.must_commit = 0;
1427         if ((bp->b_flags & (B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == 0)
1428                 iomode = NFSV3WRITE_UNSTABLE;
1429         else
1430                 iomode = NFSV3WRITE_FILESYNC;
1431
1432         KKASSERT(len <= nmp->nm_wsize);
1433
1434         nfsstats.rpccnt[NFSPROC_WRITE]++;
1435         nfsm_reqhead(info, vp, NFSPROC_WRITE,
1436                      NFSX_FH(info->v3) + 5 * NFSX_UNSIGNED + nfsm_rndup(len));
1437         ERROROUT(nfsm_fhtom(info, vp));
1438         if (info->v3) {
1439                 tl = nfsm_build(info, 5 * NFSX_UNSIGNED);
1440                 txdr_hyper(offset, tl);
1441                 tl += 2;
1442                 *tl++ = txdr_unsigned(len);
1443                 *tl++ = txdr_unsigned(iomode);
1444                 *tl = txdr_unsigned(len);
1445         } else {
1446                 u_int32_t x;
1447
1448                 tl = nfsm_build(info, 4 * NFSX_UNSIGNED);
1449                 /* Set both "begin" and "current" to non-garbage. */
1450                 x = txdr_unsigned((u_int32_t)offset);
1451                 *tl++ = x;      /* "begin offset" */
1452                 *tl++ = x;      /* "current offset" */
1453                 x = txdr_unsigned(len);
1454                 *tl++ = x;      /* total to this offset */
1455                 *tl = x;        /* size of this write */
1456         }
1457         ERROROUT(nfsm_biotom(info, bio, bp->b_dirtyoff, len));
1458         info->bio = bio;
1459         info->done = nfs_writerpc_bio_done;
1460         nfsm_request_bio(info, vp, NFSPROC_WRITE, NULL,
1461                          nfs_vpcred(vp, ND_WRITE));
1462         return;
1463 nfsmout:
1464         kfree(info, M_NFSREQ);
1465         bp->b_error = error;
1466         bp->b_flags |= B_ERROR;
1467         biodone(bio);
1468 }
1469
1470 static void
1471 nfs_writerpc_bio_done(nfsm_info_t info)
1472 {
1473         struct nfsmount *nmp = VFSTONFS(info->vp->v_mount);
1474         struct nfsnode *np = VTONFS(info->vp);
1475         struct bio *bio = info->bio;
1476         struct buf *bp = bio->bio_buf;
1477         int wccflag = NFSV3_WCCRATTR;
1478         int iomode = NFSV3WRITE_FILESYNC;
1479         int commit;
1480         int rlen;
1481         int error;
1482         int len = bp->b_resid;  /* b_resid was set to shortened length */
1483         u_int32_t *tl;
1484
1485         if (info->v3) {
1486                 /*
1487                  * The write RPC returns a before and after mtime.  The
1488                  * nfsm_wcc_data() macro checks the before n_mtime
1489                  * against the before time and stores the after time
1490                  * in the nfsnode's cached vattr and n_mtime field.
1491                  * The NRMODIFIED bit will be set if the before
1492                  * time did not match the original mtime.
1493                  */
1494                 wccflag = NFSV3_WCCCHK;
1495                 ERROROUT(nfsm_wcc_data(info, info->vp, &wccflag));
1496                 if (error == 0) {
1497                         NULLOUT(tl = nfsm_dissect(info, 2 * NFSX_UNSIGNED + NFSX_V3WRITEVERF));
1498                         rlen = fxdr_unsigned(int, *tl++);
1499                         if (rlen == 0) {
1500                                 error = NFSERR_IO;
1501                                 m_freem(info->mrep);
1502                                 info->mrep = NULL;
1503                                 goto nfsmout;
1504                         } else if (rlen < len) {
1505 #if 0
1506                                 /*
1507                                  * XXX what do we do here?
1508                                  */
1509                                 backup = len - rlen;
1510                                 uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base - backup;
1511                                 uiop->uio_iov->iov_len += backup;
1512                                 uiop->uio_offset -= backup;
1513                                 uiop->uio_resid += backup;
1514                                 len = rlen;
1515 #endif
1516                         }
1517                         commit = fxdr_unsigned(int, *tl++);
1518
1519                         /*
1520                          * Return the lowest committment level
1521                          * obtained by any of the RPCs.
1522                          */
1523                         if (iomode == NFSV3WRITE_FILESYNC)
1524                                 iomode = commit;
1525                         else if (iomode == NFSV3WRITE_DATASYNC &&
1526                                 commit == NFSV3WRITE_UNSTABLE)
1527                                 iomode = commit;
1528                         if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0){
1529                             bcopy(tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF);
1530                             nmp->nm_state |= NFSSTA_HASWRITEVERF;
1531                         } else if (bcmp(tl, nmp->nm_verf, NFSX_V3WRITEVERF)) {
1532                             info->info_writerpc.must_commit = 1;
1533                             bcopy(tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF);
1534                         }
1535                 }
1536         } else {
1537                 ERROROUT(nfsm_loadattr(info, info->vp, NULL));
1538         }
1539         m_freem(info->mrep);
1540         info->mrep = NULL;
1541         len = 0;
1542 nfsmout:
1543         if (info->vp->v_mount->mnt_flag & MNT_ASYNC)
1544                 iomode = NFSV3WRITE_FILESYNC;
1545         bp->b_resid = len;
1546
1547         /*
1548          * End of RPC.  Now clean up the bp.
1549          *
1550          * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
1551          * to cluster the buffers needing commit.  This will allow
1552          * the system to submit a single commit rpc for the whole
1553          * cluster.  We can do this even if the buffer is not 100%
1554          * dirty (relative to the NFS blocksize), so we optimize the
1555          * append-to-file-case.
1556          *
1557          * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
1558          * cleared because write clustering only works for commit
1559          * rpc's, not for the data portion of the write).
1560          */
1561         if (!error && iomode == NFSV3WRITE_UNSTABLE) {
1562                 bp->b_flags |= B_NEEDCOMMIT;
1563                 if (bp->b_dirtyoff == 0 && bp->b_dirtyend == bp->b_bcount)
1564                         bp->b_flags |= B_CLUSTEROK;
1565         } else {
1566                 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1567         }
1568
1569         /*
1570          * For an interrupted write, the buffer is still valid
1571          * and the write hasn't been pushed to the server yet,
1572          * so we can't set B_ERROR and report the interruption
1573          * by setting B_EINTR. For the async case, B_EINTR
1574          * is not relevant, so the rpc attempt is essentially
1575          * a noop.  For the case of a V3 write rpc not being
1576          * committed to stable storage, the block is still
1577          * dirty and requires either a commit rpc or another
1578          * write rpc with iomode == NFSV3WRITE_FILESYNC before
1579          * the block is reused. This is indicated by setting
1580          * the B_DELWRI and B_NEEDCOMMIT flags.
1581          *
1582          * If the buffer is marked B_PAGING, it does not reside on
1583          * the vp's paging queues so we cannot call bdirty().  The
1584          * bp in this case is not an NFS cache block so we should
1585          * be safe. XXX
1586          */
1587         if (error == EINTR || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
1588                 crit_enter();
1589                 bp->b_flags &= ~(B_INVAL|B_NOCACHE);
1590                 if ((bp->b_flags & B_PAGING) == 0)
1591                         bdirty(bp);
1592                 if (error)
1593                         bp->b_flags |= B_EINTR;
1594                 crit_exit();
1595         } else {
1596                 if (error) {
1597                         bp->b_flags |= B_ERROR;
1598                         bp->b_error = np->n_error = error;
1599                         np->n_flag |= NWRITEERR;
1600                 }
1601                 bp->b_dirtyoff = bp->b_dirtyend = 0;
1602         }
1603         if (info->info_writerpc.must_commit)
1604                 nfs_clearcommit(info->vp->v_mount);
1605         kfree(info, M_NFSREQ);
1606         if (error) {
1607                 bp->b_flags |= B_ERROR;
1608                 bp->b_error = error;
1609         }
1610         biodone(bio);
1611 }
1612
1613 /*
1614  * Nfs Version 3 commit rpc - BIO version
1615  *
1616  * This function issues the commit rpc and will chain to a write
1617  * rpc if necessary.
1618  */
1619 void
1620 nfs_commitrpc_bio(struct vnode *vp, struct bio *bio)
1621 {
1622         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
1623         struct buf *bp = bio->bio_buf;
1624         struct nfsm_info *info;
1625         int error = 0;
1626         u_int32_t *tl;
1627
1628         if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) {
1629                 bp->b_dirtyoff = bp->b_dirtyend = 0;
1630                 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1631                 bp->b_resid = 0;
1632                 biodone(bio);
1633                 return;
1634         }
1635
1636         info = kmalloc(sizeof(*info), M_NFSREQ, M_WAITOK);
1637         info->mrep = NULL;
1638         info->v3 = 1;
1639
1640         nfsstats.rpccnt[NFSPROC_COMMIT]++;
1641         nfsm_reqhead(info, vp, NFSPROC_COMMIT, NFSX_FH(1));
1642         ERROROUT(nfsm_fhtom(info, vp));
1643         tl = nfsm_build(info, 3 * NFSX_UNSIGNED);
1644         txdr_hyper(bio->bio_offset + bp->b_dirtyoff, tl);
1645         tl += 2;
1646         *tl = txdr_unsigned(bp->b_dirtyend - bp->b_dirtyoff);
1647         info->bio = bio;
1648         info->done = nfs_commitrpc_bio_done;
1649         nfsm_request_bio(info, vp, NFSPROC_COMMIT, NULL,
1650                          nfs_vpcred(vp, ND_WRITE));
1651         return;
1652 nfsmout:
1653         /*
1654          * Chain to write RPC on (early) error
1655          */
1656         kfree(info, M_NFSREQ);
1657         nfs_writerpc_bio(vp, bio);
1658 }
1659
1660 static void
1661 nfs_commitrpc_bio_done(nfsm_info_t info)
1662 {
1663         struct nfsmount *nmp = VFSTONFS(info->vp->v_mount);
1664         struct bio *bio = info->bio;
1665         struct buf *bp = bio->bio_buf;
1666         u_int32_t *tl;
1667         int wccflag = NFSV3_WCCRATTR;
1668         int error = 0;
1669
1670         ERROROUT(nfsm_wcc_data(info, info->vp, &wccflag));
1671         if (error == 0) {
1672                 NULLOUT(tl = nfsm_dissect(info, NFSX_V3WRITEVERF));
1673                 if (bcmp(nmp->nm_verf, tl, NFSX_V3WRITEVERF)) {
1674                         bcopy(tl, nmp->nm_verf, NFSX_V3WRITEVERF);
1675                         error = NFSERR_STALEWRITEVERF;
1676                 }
1677         }
1678         m_freem(info->mrep);
1679         info->mrep = NULL;
1680
1681         /*
1682          * On completion we must chain to a write bio if an
1683          * error occurred.
1684          */
1685 nfsmout:
1686         kfree(info, M_NFSREQ);
1687         if (error == 0) {
1688                 bp->b_dirtyoff = bp->b_dirtyend = 0;
1689                 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1690                 bp->b_resid = 0;
1691                 biodone(bio);
1692         } else {
1693                 nfs_writerpc_bio(info->vp, bio);
1694         }
1695 }
1696