sys/vfs/nfs/nfs_bio.c

   1 /*
   2  * Copyright (c) 1989, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  *
   5  * This code is derived from software contributed to Berkeley by
   6  * Rick Macklem at The University of Guelph.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  * 3. All advertising materials mentioning features or use of this software
  17  *    must display the following acknowledgement:
  18  *      This product includes software developed by the University of
  19  *      California, Berkeley and its contributors.
  20  * 4. Neither the name of the University nor the names of its contributors
  21  *    may be used to endorse or promote products derived from this software
  22  *    without specific prior written permission.
  23  *
  24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  34  * SUCH DAMAGE.
  35  *
  36  *      @(#)nfs_bio.c   8.9 (Berkeley) 3/30/95
  37  * $FreeBSD: /repoman/r/ncvs/src/sys/nfsclient/nfs_bio.c,v 1.130 2004/04/14 23:23:55 peadar Exp $
  38  * $DragonFly: src/sys/vfs/nfs/nfs_bio.c,v 1.45 2008/07/18 00:09:39 dillon Exp $
  39  */
  40
  41
  42 #include <sys/param.h>
  43 #include <sys/systm.h>
  44 #include <sys/resourcevar.h>
  45 #include <sys/signalvar.h>
  46 #include <sys/proc.h>
  47 #include <sys/buf.h>
  48 #include <sys/vnode.h>
  49 #include <sys/mount.h>
  50 #include <sys/kernel.h>
  51 #include <sys/mbuf.h>
  52 #include <sys/msfbuf.h>
  53
  54 #include <vm/vm.h>
  55 #include <vm/vm_extern.h>
  56 #include <vm/vm_page.h>
  57 #include <vm/vm_object.h>
  58 #include <vm/vm_pager.h>
  59 #include <vm/vnode_pager.h>
  60
  61 #include <sys/buf2.h>
  62 #include <sys/thread2.h>
  63 #include <vm/vm_page2.h>
  64
  65 #include "rpcv2.h"
  66 #include "nfsproto.h"
  67 #include "nfs.h"
  68 #include "nfsmount.h"
  69 #include "nfsnode.h"
  70 #include "xdr_subs.h"
  71 #include "nfsm_subs.h"
  72
  73
     /*
      * Forward declarations of the functions that are static to this file.
      * nfsiodone_sync() is the value nfs_bioread() and nfs_write() store in
      * b_bio2.bio_done before handing a synchronous read to nfs_doio().
      */
  74 static struct buf *nfs_getcacheblk(struct vnode *vp, off_t loffset,
  75                                    int size, struct thread *td);
  76 static int nfs_check_dirent(struct nfs_dirent *dp, int maxlen);
  77 static void nfsiodone_sync(struct bio *bio);
  78 static void nfs_readrpc_bio_done(nfsm_info_t info);
  79 static void nfs_writerpc_bio_done(nfsm_info_t info);
  80 static void nfs_commitrpc_bio_done(nfsm_info_t info);
  81
  82 /*
  83  * Vnode op for read using bio
      *
      * Reads from vp into uio through the buffer cache.  VREG, VLNK and VDIR
      * vnodes each have their own case below; any other type logs a message
      * and copies nothing.
      *
      * vp:     vnode being read
      * uio:    supplies the starting offset and the amount wanted, and
      *         receives the data
      * ioflag: only used here to derive seqcount (ioflag >> IO_SEQSHIFT),
      *         which caps the number of read-aheads for regular files
      *
      * Returns 0 when the loop ends without error (including uio_resid == 0
      * on entry and reads at or past EOF); EINVAL for a negative offset, a
      * directory offset that is not 4-byte aligned, or a directory record
      * rejected by nfs_check_dirent(); EFBIG when a non-directory read would
      * extend past nm_maxfilesize; EINTR when nfs_getcacheblk() returns NULL;
      * otherwise the error produced by VOP_GETATTR(), nfs_vinvalbuf(),
      * nfs_doio(), uiomove() or vop_write_dirent().
  84  */
  85 int
  86 nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag)
  87 {
  88         struct nfsnode *np = VTONFS(vp);
  89         int biosize, i;
  90         struct buf *bp, *rabp;
  91         struct vattr vattr;
  92         struct thread *td;
  93         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
  94         off_t lbn, rabn;
  95         off_t raoffset;
  96         off_t loffset;
  97         int seqcount;
  98         int nra, error = 0;
  99         int boff = 0;
 100         size_t n;
 101
 102 #ifdef DIAGNOSTIC
 103         if (uio->uio_rw != UIO_READ)
 104                 panic("nfs_read mode");
 105 #endif
 106         if (uio->uio_resid == 0)
 107                 return (0);
 108         if (uio->uio_offset < 0)        /* XXX VDIR cookies can be negative */
 109                 return (EINVAL);
 110         td = uio->uio_td;
 111
             /*
              * NFSv3 mount whose fsinfo has not been fetched yet: fetch it
              * now.  The return value is deliberately discarded.
              */
 112         if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
 113             (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
 114                 (void)nfs_fsinfo(nmp, vp, td);
 115         if (vp->v_type != VDIR &&
 116             (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
 117                 return (EFBIG);
 118         biosize = vp->v_mount->mnt_stat.f_iosize;
             /* sequential hint taken from ioflag, scaled by biosize / BKVASIZE */
 119         seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE);
 120
 121         /*
 122          * For nfs, cache consistency can only be maintained approximately.
 123          * Although RFC1094 does not specify the criteria, the following is
 124          * believed to be compatible with the reference port.
 125          *
 126          * NFS:         If local changes have been made and this is a
 127          *              directory, the directory must be invalidated and
 128          *              the attribute cache must be cleared.
 129          *
 130          *              GETATTR is called to synchronize the file size.
 131          *
 132          *              If remote changes are detected local data is flushed
 133          *              and the cache is invalidated.
 134          *
 135          *              NOTE: In the normal case the attribute cache is not
 136          *              cleared which means GETATTR may use cached data and
 137          *              not immediately detect changes made on the server.
 138          */
 139         if ((np->n_flag & NLMODIFIED) && vp->v_type == VDIR) {
 140                 nfs_invaldir(vp);
 141                 error = nfs_vinvalbuf(vp, V_SAVE, 1);
 142                 if (error)
 143                         return (error);
 144                 np->n_attrstamp = 0;
 145         }
 146         error = VOP_GETATTR(vp, &vattr);
 147         if (error)
 148                 return (error);
 149
 150         /*
 151          * This can deadlock getpages/putpages for regular
 152          * files.  Only do it for directories.
              *
              * NRMODIFIED is therefore only cleared here for directories;
              * for other vnode types the flag is left as it was.
 153          */
 154         if (np->n_flag & NRMODIFIED) {
 155                 if (vp->v_type == VDIR) {
 156                         nfs_invaldir(vp);
 157                         error = nfs_vinvalbuf(vp, V_SAVE, 1);
 158                         if (error)
 159                                 return (error);
 160                         np->n_flag &= ~NRMODIFIED;
 161                 }
 162         }
 163
 164         /*
 165          * Loop until uio exhausted or we hit EOF
              *
              * The first switch obtains the buffer (bp) and computes n, the
              * number of bytes usable from bp starting at boff.  The second
              * switch copies the data out.  bp is released at the bottom of
              * every pass.
 166          */
 167         do {
                 /* stays NULL when a case yields no buffer (VREG at EOF, bad type) */
 168             bp = NULL;
 169
 170             switch (vp->v_type) {
 171             case VREG:
 172                 nfsstats.biocache_reads++;
 173                 lbn = uio->uio_offset / biosize;
                     /* the mask assumes biosize is a power of two -- TODO confirm */
 174                 boff = uio->uio_offset & (biosize - 1);
 175                 loffset = (off_t)lbn * biosize;
 176
 177                 /*
 178                  * Start the read ahead(s), as required.
                      *
                      * At most nm_readahead and at most seqcount blocks are
                      * started, and never a block that begins at or beyond
                      * n_size.  Blocks that findblk() already locates are
                      * skipped.
 179                  */
 180                 if (nmp->nm_readahead > 0 && nfs_asyncok(nmp)) {
 181                     for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
 182                         (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
 183                         rabn = lbn + 1 + nra;
 184                         raoffset = (off_t)rabn * biosize;
 185                         if (findblk(vp, raoffset, FINDBLK_TEST) == NULL) {
 186                             rabp = nfs_getcacheblk(vp, raoffset, biosize, td);
 187                             if (!rabp)
 188                                 return (EINTR);
                                 /*
                                  * Neither B_CACHE nor B_DELWRI set: hand the
                                  * buffer to nfs_asyncio() as a read.
                                  * Otherwise just release it.
                                  */
 189                             if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
 190                                 rabp->b_cmd = BUF_CMD_READ;
 191                                 vfs_busy_pages(vp, rabp);
 192                                 nfs_asyncio(vp, &rabp->b_bio2);
 193                             } else {
 194                                 brelse(rabp);
 195                             }
 196                         }
 197                     }
 198                 }
 199
 200                 /*
 201                  * Obtain the buffer cache block.  Figure out the buffer size
 202                  * when we are at EOF.  If we are modifying the size of the
 203                  * buffer based on an EOF condition we need to hold
 204                  * nfs_rslock() through obtaining the buffer to prevent
 205                  * a potential writer-appender from messing with n_size.
 206                  * Otherwise we may accidentally truncate the buffer and
 207                  * lose dirty data.
 208                  *
 209                  * Note that bcount is *not* DEV_BSIZE aligned.
                      *
                      * NOTE(review): no nfs_rslock() call is visible in this
                      * function and the buffer is always requested at the full
                      * biosize, so the locking remark above may be historical
                      * -- confirm.
 210                  */
                     /* read starts at or past EOF: nothing to copy, bp stays NULL */
 211                 if (loffset + boff >= np->n_size) {
 212                         n = 0;
 213                         break;
 214                 }
 215                 bp = nfs_getcacheblk(vp, loffset, biosize, td);
 216
 217                 if (bp == NULL)
 218                         return (EINTR);
 219
 220                 /*
 221                  * If B_CACHE is not set, we must issue the read.  If this
 222                  * fails, we return an error.
 223                  */
 224                 if ((bp->b_flags & B_CACHE) == 0) {
 225                         bp->b_cmd = BUF_CMD_READ;
 226                         bp->b_bio2.bio_done = nfsiodone_sync;
 227                         bp->b_bio2.bio_flags |= BIO_SYNC;
 228                         vfs_busy_pages(vp, bp);
 229                         error = nfs_doio(vp, &bp->b_bio2, td);
 230                         if (error) {
 231                                 brelse(bp);
 232                                 return (error);
 233                         }
 234                 }
 235
 236                 /*
 237                  * boff is the offset into the current bp.  Figure out how
 238                  * many bytes we can copy out of the bp.  Note that bcount is
 239                  * NOT DEV_BSIZE aligned.
 240                  *
 241                  * Then figure out how many bytes we can copy into the uio.
                      * The result is finally clipped so the copy ends at n_size.
 242                  */
 243                 n = biosize - boff;
 244                 if (n > uio->uio_resid)
 245                         n = uio->uio_resid;
 246                 if (loffset + boff + n > np->n_size)
 247                         n = np->n_size - loffset - boff;
 248                 break;
 249             case VLNK:
                     /*
                      * biosize is overwritten for the symlink case: the
                      * smaller of NFS_MAXPATHLEN and n_size.  The whole link
                      * is read as a single buffer at offset 0.
                      */
 250                 biosize = min(NFS_MAXPATHLEN, np->n_size);
 251                 nfsstats.biocache_readlinks++;
 252                 bp = nfs_getcacheblk(vp, (off_t)0, biosize, td);
 253                 if (bp == NULL)
 254                         return (EINTR);
 255                 if ((bp->b_flags & B_CACHE) == 0) {
 256                         bp->b_cmd = BUF_CMD_READ;
 257                         bp->b_bio2.bio_done = nfsiodone_sync;
 258                         bp->b_bio2.bio_flags |= BIO_SYNC;
 259                         vfs_busy_pages(vp, bp);
 260                         error = nfs_doio(vp, &bp->b_bio2, td);
 261                         if (error) {
 262                                 bp->b_flags |= B_ERROR | B_INVAL;
 263                                 brelse(bp);
 264                                 return (error);
 265                         }
 266                 }
                     /* copy at most b_bcount - b_resid bytes, from the buffer start */
 267                 n = szmin(uio->uio_resid, (size_t)bp->b_bcount - bp->b_resid);
 268                 boff = 0;
 269                 break;
 270             case VDIR:
 271                 nfsstats.biocache_readdirs++;
                     /* a directory EOF offset is recorded and we are at/past it */
 272                 if (np->n_direofoffset &&
 273                     uio->uio_offset >= np->n_direofoffset
 274                 ) {
 275                         return (0);
 276                 }
 277                 lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
 278                 boff = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
 279                 loffset = uio->uio_offset - boff;
 280                 bp = nfs_getcacheblk(vp, loffset, NFS_DIRBLKSIZ, td);
 281                 if (bp == NULL)
 282                         return (EINTR);
 283
 284                 if ((bp->b_flags & B_CACHE) == 0) {
 285                     bp->b_cmd = BUF_CMD_READ;
 286                     bp->b_bio2.bio_done = nfsiodone_sync;
 287                     bp->b_bio2.bio_flags |= BIO_SYNC;
 288                     vfs_busy_pages(vp, bp);
 289                     error = nfs_doio(vp, &bp->b_bio2, td);
                         /*
                          * On failure drop bp right away.  A bad-cookie error
                          * is retried by the while loop below, which acquires
                          * its own buffers; any other error is returned once
                          * that loop has been skipped.
                          */
 290                     if (error)
 291                             brelse(bp);
 292                     while (error == NFSERR_BAD_COOKIE) {
 293                         kprintf("got bad cookie vp %p bp %p\n", vp, bp);
 294                         nfs_invaldir(vp);
 295                         error = nfs_vinvalbuf(vp, 0, 1);
 296                         /*
 297                          * Yuck! The directory has been modified on the
 298                          * server. The only way to get the block is by
 299                          * reading from the beginning to get all the
 300                          * offset cookies.
 301                          *
 302                          * Leave the last bp intact unless there is an error.
 303                          * Loop back up to the while if the error is another
 304                          * NFSERR_BAD_COOKIE (double yuch!).
 305                          */
 306                         for (i = 0; i <= lbn && !error; i++) {
                                 /*
                                  * Block i starts at or past the recorded
                                  * directory EOF: report an empty read.  No
                                  * buffer is held at this point.
                                  */
 307                             if (np->n_direofoffset
 308                                 && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
 309                                     return (0);
 310                             bp = nfs_getcacheblk(vp, (off_t)i * NFS_DIRBLKSIZ,
 311                                                  NFS_DIRBLKSIZ, td);
 312                             if (!bp)
 313                                 return (EINTR);
 314                             if ((bp->b_flags & B_CACHE) == 0) {
 315                                     bp->b_cmd = BUF_CMD_READ;
 316                                     bp->b_bio2.bio_done = nfsiodone_sync;
 317                                     bp->b_bio2.bio_flags |= BIO_SYNC;
 318                                     vfs_busy_pages(vp, bp);
 319                                     error = nfs_doio(vp, &bp->b_bio2, td);
 320                                     /*
 321                                      * no error + B_INVAL == directory EOF,
 322                                      * use the block.
 323                                      */
 324                                     if (error == 0 && (bp->b_flags & B_INVAL))
 325                                             break;
 326                             }
 327                             /*
 328                              * An error will throw away the block and the
 329                              * for loop will break out.  If no error and this
 330                              * is not the block we want, we throw away the
 331                              * block and go for the next one via the for loop.
 332                              */
 333                             if (error || i < lbn)
 334                                     brelse(bp);
 335                         }
 336                     }
 337                     /*
 338                      * The above while is repeated if we hit another cookie
 339                      * error.  If we hit an error and it wasn't a cookie error,
 340                      * we give up.
 341                      */
 342                     if (error)
 343                             return (error);
 344                 }
 345
 346                 /*
 347                  * If not eof and read aheads are enabled, start one.
 348                  * (You need the current block first, so that you have the
 349                  *  directory offset cookie of the next block.)
 350                  */
 351                 if (nmp->nm_readahead > 0 && nfs_asyncok(nmp) &&
 352                     (bp->b_flags & B_INVAL) == 0 &&
 353                     (np->n_direofoffset == 0 ||
 354                     loffset + NFS_DIRBLKSIZ < np->n_direofoffset) &&
 355                     findblk(vp, loffset + NFS_DIRBLKSIZ, FINDBLK_TEST) == NULL
 356                 ) {
 357                         rabp = nfs_getcacheblk(vp, loffset + NFS_DIRBLKSIZ,
 358                                                NFS_DIRBLKSIZ, td);
                             /* a NULL rabp is tolerated: the read-ahead is skipped */
 359                         if (rabp) {
 360                             if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
 361                                 rabp->b_cmd = BUF_CMD_READ;
 362                                 vfs_busy_pages(vp, rabp);
 363                                 nfs_asyncio(vp, &rabp->b_bio2);
 364                             } else {
 365                                 brelse(rabp);
 366                             }
 367                         }
 368                 }
 369                 /*
 370                  * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
 371                  * chopped for the EOF condition, we cannot tell how large
 372                  * NFS directories are going to be until we hit EOF.  So
 373                  * an NFS directory buffer is *not* chopped to its EOF.  Now,
 374                  * it just so happens that b_resid will effectively chop it
 375                  * to EOF.  *BUT* this information is lost if the buffer goes
 376                  * away and is reconstituted into a B_CACHE state ( due to
 377                  * being VMIO ) later.  So we keep track of the directory eof
 378                  * in np->n_direofoffset and chop it off as an extra step
 379                  * right here.
 380                  *
 381                  * NOTE: boff could already be beyond EOF.
 382                  */
 383                 if ((size_t)boff > NFS_DIRBLKSIZ - bp->b_resid) {
 384                         n = 0;
 385                 } else {
 386                         n = szmin(uio->uio_resid,
 387                                   NFS_DIRBLKSIZ - bp->b_resid - (size_t)boff);
 388                 }
 389                 if (np->n_direofoffset &&
 390                     n > (size_t)(np->n_direofoffset - uio->uio_offset)) {
 391                         n = (size_t)(np->n_direofoffset - uio->uio_offset);
 392                 }
 393                 break;
 394             default:
 395                 kprintf(" nfs_bioread: type %x unexpected\n",vp->v_type);
 396                 n = 0;
 397                 break;
 398             };
 399
                 /*
                  * Second stage: move n bytes starting at bp->b_data + boff
                  * out to the caller.  bp is only dereferenced when n > 0.
                  */
 400             switch (vp->v_type) {
 401             case VREG:
 402                 if (n > 0)
 403                     error = uiomove(bp->b_data + boff, n, uio);
 404                 break;
 405             case VLNK:
 406                 if (n > 0)
 407                     error = uiomove(bp->b_data + boff, n, uio);
                     /* n = 0 makes the do/while stop after this single pass */
 408                 n = 0;
 409                 break;
 410             case VDIR:
 411                 if (n > 0) {
 412                     off_t old_off = uio->uio_offset;
 413                     caddr_t cpos, epos;
 414                     struct nfs_dirent *dp;
 415
 416                     /*
 417                      * We are casting cpos to nfs_dirent, it must be
 418                      * int-aligned.
 419                      */
 420                     if (boff & 3) {
 421                         error = EINVAL;
 422                         break;
 423                     }
 424
                         /*
                          * Walk the records in [cpos, epos).  Each record is
                          * validated against the bytes that remain before
                          * it is passed to vop_write_dirent().  A nonzero
                          * return from vop_write_dirent() stops the walk; it
                          * is given &error and so may also set error.
                          */
 425                     cpos = bp->b_data + boff;
 426                     epos = bp->b_data + boff + n;
 427                     while (cpos < epos && error == 0 && uio->uio_resid > 0) {
 428                             dp = (struct nfs_dirent *)cpos;
 429                             error = nfs_check_dirent(dp, (int)(epos - cpos));
 430                             if (error)
 431                                     break;
 432                             if (vop_write_dirent(&error, uio, dp->nfs_ino,
 433                                 dp->nfs_type, dp->nfs_namlen, dp->nfs_name)) {
 434                                     break;
 435                             }
 436                             cpos += dp->nfs_reclen;
 437                     }
                         /* directories are served one block per call: end the loop */
 438                     n = 0;
                         /*
                          * On success uio_offset is set to old_off plus the
                          * number of buffer bytes consumed by the walk above.
                          */
 439                     if (error == 0) {
 440                             uio->uio_offset = old_off + cpos -
 441                                               bp->b_data - boff;
 442                     }
 443                 }
 444                 break;
 445             default:
 446                 kprintf(" nfs_bioread: type %x unexpected\n",vp->v_type);
 447             }
 448             if (bp)
 449                     brelse(bp);
                 /*
                  * Only VREG can reach here with n > 0, so only regular files
                  * take more than one pass through the loop.
                  */
 450         } while (error == 0 && uio->uio_resid > 0 && n > 0);
 451         return (error);
 452 }
 453
 454 /*
 455  * Sanity-check one cached directory record before it is handed on.
 456  * The caller's seek offset is arbitrary, so dp may point at garbage.
 457  * The record must fit in maxlen, be a multiple of 4 bytes long, and
 458  * leave room after the name for a nul that nfs_namlen does not count.
 459  */
 460 static
 461 int
 462 nfs_check_dirent(struct nfs_dirent *dp, int maxlen)
 463 {
 464         int hdrlen = offsetof(struct nfs_dirent, nfs_name[0]);
 465         int bad;
 466
 467         /* test order matters: dp is only read once maxlen covers hdrlen */
 468         bad = (maxlen <= hdrlen ||
 469                dp->nfs_reclen < hdrlen ||
 470                dp->nfs_reclen > maxlen ||
 471                hdrlen + dp->nfs_namlen >= dp->nfs_reclen ||
 472                (dp->nfs_reclen & 3) != 0);
 473
 474         return (bad ? EINVAL : 0);
 475 }
 476
 477 /*
 478  * Vnode op for write using bio
 479  *
 480  * nfs_write(struct vnode *a_vp, struct uio *a_uio, int a_ioflag,
 481  *           struct ucred *a_cred)
 482  */
 483 int
 484 nfs_write(struct vop_write_args *ap)
 485 {
 486         struct uio *uio = ap->a_uio;
 487         struct thread *td = uio->uio_td;
 488         struct vnode *vp = ap->a_vp;
 489         struct nfsnode *np = VTONFS(vp);
 490         int ioflag = ap->a_ioflag;
 491         struct buf *bp;
 492         struct vattr vattr;
 493         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 494         off_t loffset;
 495         int boff, bytes;
 496         int error = 0;
 497         int haverslock = 0;
 498         int bcount;
 499         int biosize;
 500         int trivial;
 501
 502 #ifdef DIAGNOSTIC
 503         if (uio->uio_rw != UIO_WRITE)
 504                 panic("nfs_write mode");
 505         if (uio->uio_segflg == UIO_USERSPACE && uio->uio_td != curthread)
 506                 panic("nfs_write proc");
 507 #endif
 508         if (vp->v_type != VREG)
 509                 return (EIO);
 510         if (np->n_flag & NWRITEERR) {
 511                 np->n_flag &= ~NWRITEERR;
 512                 return (np->n_error);
 513         }
 514         if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
 515             (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
 516                 (void)nfs_fsinfo(nmp, vp, td);
 517
 518         /*
 519          * Synchronously flush pending buffers if we are in synchronous
 520          * mode or if we are appending.
 521          */
 522         if (ioflag & (IO_APPEND | IO_SYNC)) {
 523                 if (np->n_flag & NLMODIFIED) {
 524                         np->n_attrstamp = 0;
 525                         error = nfs_flush(vp, MNT_WAIT, td, 0);
 526                         /* error = nfs_vinvalbuf(vp, V_SAVE, 1); */
 527                         if (error)
 528                                 return (error);
 529                 }
 530         }
 531
 532         /*
 533          * If IO_APPEND then load uio_offset.  We restart here if we cannot
 534          * get the append lock.
 535          */
 536 restart:
 537         if (ioflag & IO_APPEND) {
 538                 np->n_attrstamp = 0;
 539                 error = VOP_GETATTR(vp, &vattr);
 540                 if (error)
 541                         return (error);
 542                 uio->uio_offset = np->n_size;
 543         }
 544
 545         if (uio->uio_offset < 0)
 546                 return (EINVAL);
 547         if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
 548                 return (EFBIG);
 549         if (uio->uio_resid == 0)
 550                 return (0);
 551
 552         /*
 553          * We need to obtain the rslock if we intend to modify np->n_size
 554          * in order to guarantee the append point with multiple contending
 555          * writers, to guarantee that no other appenders modify n_size
 556          * while we are trying to obtain a truncated buffer (i.e. to avoid
 557          * accidentally truncating data written by another appender due to
 558          * the race), and to ensure that the buffer is populated prior to
 559          * our extending of the file.  We hold rslock through the entire
 560          * operation.
 561          *
 562          * Note that we do not synchronize the case where someone truncates
 563          * the file while we are appending to it because attempting to lock
 564          * this case may deadlock other parts of the system unexpectedly.
 565          */
 566         if ((ioflag & IO_APPEND) ||
 567             uio->uio_offset + uio->uio_resid > np->n_size) {
 568                 switch(nfs_rslock(np)) {
 569                 case ENOLCK:
 570                         goto restart;
 571                         /* not reached */
 572                 case EINTR:
 573                 case ERESTART:
 574                         return(EINTR);
 575                         /* not reached */
 576                 default:
 577                         break;
 578                 }
 579                 haverslock = 1;
 580         }
 581
 582         /*
 583          * Maybe this should be above the vnode op call, but so long as
 584          * file servers have no limits, i don't think it matters
 585          */
 586         if (td && td->td_proc && uio->uio_offset + uio->uio_resid >
 587               td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
 588                 lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
 589                 if (haverslock)
 590                         nfs_rsunlock(np);
 591                 return (EFBIG);
 592         }
 593
 594         biosize = vp->v_mount->mnt_stat.f_iosize;
 595
 596         do {
 597                 nfsstats.biocache_writes++;
 598                 boff = uio->uio_offset & (biosize-1);
 599                 loffset = uio->uio_offset - boff;
 600                 bytes = (int)szmin((unsigned)(biosize - boff), uio->uio_resid);
 601 again:
 602                 /*
 603                  * Handle direct append and file extension cases, calculate
 604                  * unaligned buffer size.  When extending B_CACHE will be
 605                  * set if possible.  See UIO_NOCOPY note below.
 606                  */
 607                 if (uio->uio_offset + bytes > np->n_size) {
 608                         np->n_flag |= NLMODIFIED;
 609                         trivial = (uio->uio_segflg != UIO_NOCOPY &&
 610                                    uio->uio_offset <= np->n_size);
 611                         nfs_meta_setsize(vp, td, uio->uio_offset + bytes,
 612                                          trivial);
 613                 }
 614                 bp = nfs_getcacheblk(vp, loffset, biosize, td);
 615                 if (bp == NULL) {
 616                         error = EINTR;
 617                         break;
 618                 }
 619
 620                 /*
 621                  * Actual bytes in buffer which we care about
 622                  */
 623                 if (loffset + biosize < np->n_size)
 624                         bcount = biosize;
 625                 else
 626                         bcount = (int)(np->n_size - loffset);
 627
 628                 /*
 629                  * Avoid a read by setting B_CACHE where the data we
 630                  * intend to write covers the entire buffer.  Note
 631                  * that the buffer may have been set to B_CACHE by
 632                  * nfs_meta_setsize() above or otherwise inherited the
 633                  * flag, but if B_CACHE isn't set the buffer may be
 634                  * uninitialized and must be zero'd to accommodate
 635                  * future seek+write's.
 636                  *
 637                  * See the comments in kern/vfs_bio.c's getblk() for
 638                  * more information.
 639                  *
 640                  * When doing a UIO_NOCOPY write the buffer is not
 641                  * overwritten and we cannot just set B_CACHE unconditionally
 642                  * for full-block writes.
 643                  */
 644                 if (boff == 0 && bytes == biosize &&
 645                     uio->uio_segflg != UIO_NOCOPY) {
 646                         bp->b_flags |= B_CACHE;
 647                         bp->b_flags &= ~(B_ERROR | B_INVAL);
 648                 }
 649
 650                 /*
 651                  * b_resid may be set due to file EOF if we extended out.
 652                  * The NFS bio code will zero the difference anyway so
 653                  * just acknowledge the fact and set b_resid to 0.
 654                  */
 655                 if ((bp->b_flags & B_CACHE) == 0) {
 656                         bp->b_cmd = BUF_CMD_READ;
 657                         bp->b_bio2.bio_done = nfsiodone_sync;
 658                         bp->b_bio2.bio_flags |= BIO_SYNC;
 659                         vfs_busy_pages(vp, bp);
 660                         error = nfs_doio(vp, &bp->b_bio2, td);
 661                         if (error) {
 662                                 brelse(bp);
 663                                 break;
 664                         }
 665                         bp->b_resid = 0;
 666                 }
 667                 np->n_flag |= NLMODIFIED;
 668
 669                 /*
 670                  * If dirtyend exceeds file size, chop it down.  This should
 671                  * not normally occur but there is an append race where it
 672                  * might occur XXX, so we log it.
 673                  *
 674                  * If the chopping creates a reverse-indexed or degenerate
 675                  * situation with dirtyoff/end, we 0 both of them.
 676                  */
 677                 if (bp->b_dirtyend > bcount) {
 678                         kprintf("NFS append race @%08llx:%d\n",
 679                             (long long)bp->b_bio2.bio_offset,
 680                             bp->b_dirtyend - bcount);
 681                         bp->b_dirtyend = bcount;
 682                 }
 683
 684                 if (bp->b_dirtyoff >= bp->b_dirtyend)
 685                         bp->b_dirtyoff = bp->b_dirtyend = 0;
 686
 687                 /*
 688                  * If the new write will leave a contiguous dirty
 689                  * area, just update the b_dirtyoff and b_dirtyend,
 690                  * otherwise force a write rpc of the old dirty area.
 691                  *
 692                  * While it is possible to merge discontiguous writes due to
 693                  * our having a B_CACHE buffer ( and thus valid read data
 694                  * for the hole), we don't because it could lead to
 695                  * significant cache coherency problems with multiple clients,
 696                  * especially if locking is implemented later on.
 697                  *
 698                  * as an optimization we could theoretically maintain
 699                  * a linked list of discontinuous areas, but we would still
 700                  * have to commit them separately so there isn't much
 701                  * advantage to it except perhaps a bit of asynchronization.
 702                  */
 703                 if (bp->b_dirtyend > 0 &&
 704                     (boff > bp->b_dirtyend ||
 705                      (boff + bytes) < bp->b_dirtyoff)
 706                 ) {
 707                         if (bwrite(bp) == EINTR) {
 708                                 error = EINTR;
 709                                 break;
 710                         }
 711                         goto again;
 712                 }
 713
 714                 error = uiomove(bp->b_data + boff, bytes, uio);
 715
 716                 /*
 717                  * Since this block is being modified, it must be written
 718                  * again and not just committed.  Since write clustering does
 719                  * not work for the stage 1 data write, only the stage 2
 720                  * commit rpc, we have to clear B_CLUSTEROK as well.
 721                  */
 722                 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
 723
 724                 if (error) {
 725                         brelse(bp);
 726                         break;
 727                 }
 728
 729                 /*
 730                  * Only update dirtyoff/dirtyend if not a degenerate
 731                  * condition.
 732                  *
 733                  * The underlying VM pages have been marked valid by
 734                  * virtue of acquiring the bp.  Because the entire buffer
 735                  * is marked dirty we do not have to worry about cleaning
 736                  * out the related dirty bits (and wouldn't really know
 737                  * how to deal with byte ranges anyway)
 738                  */
 739                 if (bytes) {
 740                         if (bp->b_dirtyend > 0) {
 741                                 bp->b_dirtyoff = imin(boff, bp->b_dirtyoff);
 742                                 bp->b_dirtyend = imax(boff + bytes,
 743                                                       bp->b_dirtyend);
 744                         } else {
 745                                 bp->b_dirtyoff = boff;
 746                                 bp->b_dirtyend = boff + bytes;
 747                         }
 748                 }
 749
 750                 /*
 751                  * If the lease is non-cachable or IO_SYNC do bwrite().
 752                  *
 753                  * IO_INVAL appears to be unused.  The idea appears to be
 754                  * to turn off caching in this case.  Very odd.  XXX
 755                  *
 756                  * If nfs_async is set bawrite() will use an unstable write
 757                  * (build dirty bufs on the server), so we might as well
 758                  * push it out with bawrite().  If nfs_async is not set we
 759                  * use bdwrite() to cache dirty bufs on the client.
 760                  */
 761                 if (ioflag & IO_SYNC) {
 762                         if (ioflag & IO_INVAL)
 763                                 bp->b_flags |= B_NOCACHE;
 764                         error = bwrite(bp);
 765                         if (error)
 766                                 break;
 767                 } else if (boff + bytes == biosize && nfs_async) {
 768                         bawrite(bp);
 769                 } else {
 770                         bdwrite(bp);
 771                 }
 772         } while (uio->uio_resid > 0 && bytes > 0);
 773
 774         if (haverslock)
 775                 nfs_rsunlock(np);
 776
 777         return (error);
 778 }
 779
 780 /*
 781  * Get an nfs cache block.
 782  *
 783  * Allocate a new one if the block isn't currently in the cache
 784  * and return the block marked busy. If the calling process is
 785  * interrupted by a signal for an interruptible mount point, return
 786  * NULL.
 787  *
 788  * The caller must carefully deal with the possible B_INVAL state of
 789  * the buffer.  nfs_startio() clears B_INVAL (and nfs_asyncio() clears it
 790  * indirectly), so synchronous reads can be issued without worrying about
 791  * the B_INVAL state.  We have to be a little more careful when dealing
 792  * with writes (see comments in nfs_write()) when extending a file past
 793  * its EOF.
 794  */
 795 static struct buf *
 796 nfs_getcacheblk(struct vnode *vp, off_t loffset, int size, struct thread *td)
 797 {
 798         struct buf *bp;
 799         struct mount *mp;
 800         struct nfsmount *nmp;
 801
 802         mp = vp->v_mount;
 803         nmp = VFSTONFS(mp);
 804
 805         if (nmp->nm_flag & NFSMNT_INT) {
 806                 bp = getblk(vp, loffset, size, GETBLK_PCATCH, 0);
 807                 while (bp == NULL) {
 808                         if (nfs_sigintr(nmp, NULL, td))
 809                                 return (NULL);
 810                         bp = getblk(vp, loffset, size, 0, 2 * hz);
 811                 }
 812         } else {
 813                 bp = getblk(vp, loffset, size, 0, 0);
 814         }
 815
 816         /*
 817          * bio2, the 'device' layer.  Since BIOs use 64 bit byte offsets
 818          * now, no translation is necessary.
 819          */
 820         bp->b_bio2.bio_offset = loffset;
 821         return (bp);
 822 }
 823
 824 /*
 825  * Flush and invalidate all dirty buffers. If another process is already
 826  * doing the flush, just wait for completion.
 827  */
 828 int
 829 nfs_vinvalbuf(struct vnode *vp, int flags, int intrflg)
 830 {
 831         struct nfsnode *np = VTONFS(vp);
 832         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 833         int error = 0, slpflag, slptimeo;
 834         thread_t td = curthread;
 835
 836         if (vp->v_flag & VRECLAIMED)
 837                 return (0);
 838
 839         if ((nmp->nm_flag & NFSMNT_INT) == 0)
 840                 intrflg = 0;
 841         if (intrflg) {
 842                 slpflag = PCATCH;
 843                 slptimeo = 2 * hz;
 844         } else {
 845                 slpflag = 0;
 846                 slptimeo = 0;
 847         }
 848         /*
 849          * First wait for any other process doing a flush to complete.
 850          */
 851         while (np->n_flag & NFLUSHINPROG) {
 852                 np->n_flag |= NFLUSHWANT;
 853                 error = tsleep((caddr_t)&np->n_flag, 0, "nfsvinval", slptimeo);
 854                 if (error && intrflg && nfs_sigintr(nmp, NULL, td))
 855                         return (EINTR);
 856         }
 857
 858         /*
 859          * Now, flush as required.
 860          */
 861         np->n_flag |= NFLUSHINPROG;
 862         error = vinvalbuf(vp, flags, slpflag, 0);
 863         while (error) {
 864                 if (intrflg && nfs_sigintr(nmp, NULL, td)) {
 865                         np->n_flag &= ~NFLUSHINPROG;
 866                         if (np->n_flag & NFLUSHWANT) {
 867                                 np->n_flag &= ~NFLUSHWANT;
 868                                 wakeup((caddr_t)&np->n_flag);
 869                         }
 870                         return (EINTR);
 871                 }
 872                 error = vinvalbuf(vp, flags, 0, slptimeo);
 873         }
 874         np->n_flag &= ~(NLMODIFIED | NFLUSHINPROG);
 875         if (np->n_flag & NFLUSHWANT) {
 876                 np->n_flag &= ~NFLUSHWANT;
 877                 wakeup((caddr_t)&np->n_flag);
 878         }
 879         return (0);
 880 }
 881
 882 /*
 883  * Return true (non-zero) if the txthread and rxthread are operational
 884  * and we do not already have too many not-yet-started BIO's built up.
 885  */
 886 int
 887 nfs_asyncok(struct nfsmount *nmp)
 888 {
 889         return (nmp->nm_bioqlen < nfs_maxasyncbio &&
 890                 nmp->nm_bioqlen < nmp->nm_maxasync_scaled / NFS_ASYSCALE &&
 891                 nmp->nm_rxstate <= NFSSVC_PENDING &&
 892                 nmp->nm_txstate <= NFSSVC_PENDING);
 893 }
 894
 895 /*
 896  * The read-ahead code calls this to queue a bio to the txthread.
 897  *
 898  * We don't touch the bio otherwise... that is, we do not even
 899  * construct or send the initial rpc.  The txthread will do it
 900  * for us.
 901  *
 902  * NOTE!  nm_bioqlen is not decremented until the request completes,
 903  *        so it does not reflect the number of bio's on bioq.
 904  */
 905 void
 906 nfs_asyncio(struct vnode *vp, struct bio *bio)
 907 {
 908         struct buf *bp = bio->bio_buf;
 909         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
 910
 911         KKASSERT(vp->v_tag == VT_NFS);
 912         BUF_KERNPROC(bp);
 913         bio->bio_driver_info = vp;
 914         crit_enter();
 915         TAILQ_INSERT_TAIL(&nmp->nm_bioq, bio, bio_act);
 916         atomic_add_int(&nmp->nm_bioqlen, 1);
 917         crit_exit();
 918         nfssvc_iod_writer_wakeup(nmp);
 919 }
 920
 921 /*
 922  * nfs_dio()    - Execute a BIO operation synchronously.  The BIO will be
 923  *                completed and its error returned.  The caller is responsible
 924  *                for brelse()ing it.  ONLY USE FOR BIO_SYNC IOs!  Otherwise
 925  *                our error probe will be against an invalid pointer.
 926  *
 927  * nfs_startio()- Execute a BIO operation assynchronously.
 928  *
 929  * NOTE: nfs_asyncio() is used to initiate an asynchronous BIO operation,
 930  *       which basically just queues it to the txthread.  nfs_startio()
 931  *       actually initiates the I/O AFTER it has gotten to the txthread.
 932  *
 933  * NOTE: td might be NULL.
 934  *
 935  * NOTE: Caller has already busied the I/O.
 936  */
 937 void
 938 nfs_startio(struct vnode *vp, struct bio *bio, struct thread *td)
 939 {
 940         struct buf *bp = bio->bio_buf;
 941         struct nfsnode *np;
 942         struct nfsmount *nmp;
 943
 944         KKASSERT(vp->v_tag == VT_NFS);
 945         np = VTONFS(vp);
 946         nmp = VFSTONFS(vp->v_mount);
 947
 948         /*
 949          * clear B_ERROR and B_INVAL state prior to initiating the I/O.  We
 950          * do this here so we do not have to do it in all the code that
 951          * calls us.
 952          */
 953         bp->b_flags &= ~(B_ERROR | B_INVAL);
 954
 955         KASSERT(bp->b_cmd != BUF_CMD_DONE,
 956                 ("nfs_doio: bp %p already marked done!", bp));
 957
 958         if (bp->b_cmd == BUF_CMD_READ) {
 959             switch (vp->v_type) {
 960             case VREG:
 961                 nfsstats.read_bios++;
 962                 nfs_readrpc_bio(vp, bio);
 963                 break;
 964             case VLNK:
 965 #if 0
 966                 bio->bio_offset = 0;
 967                 nfsstats.readlink_bios++;
 968                 nfs_readlinkrpc_bio(vp, bio);
 969 #else
 970                 nfs_doio(vp, bio, td);
 971 #endif
 972                 break;
 973             case VDIR:
 974                 /*
 975                  * NOTE: If nfs_readdirplusrpc_bio() is requested but
 976                  *       not supported, it will chain to
 977                  *       nfs_readdirrpc_bio().
 978                  */
 979 #if 0
 980                 nfsstats.readdir_bios++;
 981                 uiop->uio_offset = bio->bio_offset;
 982                 if (nmp->nm_flag & NFSMNT_RDIRPLUS)
 983                         nfs_readdirplusrpc_bio(vp, bio);
 984                 else
 985                         nfs_readdirrpc_bio(vp, bio);
 986 #else
 987                 nfs_doio(vp, bio, td);
 988 #endif
 989                 break;
 990             default:
 991                 kprintf("nfs_doio:  type %x unexpected\n",vp->v_type);
 992                 bp->b_flags |= B_ERROR;
 993                 bp->b_error = EINVAL;
 994                 biodone(bio);
 995                 break;
 996             }
 997         } else {
 998             /*
 999              * If we only need to commit, try to commit.  If this fails
1000              * it will chain through to the write.  Basically all the logic
1001              * in nfs_doio() is replicated.
1002              */
1003             KKASSERT(bp->b_cmd == BUF_CMD_WRITE);
1004             if (bp->b_flags & B_NEEDCOMMIT)
1005                 nfs_commitrpc_bio(vp, bio);
1006             else
1007                 nfs_writerpc_bio(vp, bio);
1008         }
1009 }
1010
1011 int
1012 nfs_doio(struct vnode *vp, struct bio *bio, struct thread *td)
1013 {
1014         struct buf *bp = bio->bio_buf;
1015         struct uio *uiop;
1016         struct nfsnode *np;
1017         struct nfsmount *nmp;
1018         int error = 0;
1019         int iomode, must_commit;
1020         size_t n;
1021         struct uio uio;
1022         struct iovec io;
1023
1024         KKASSERT(vp->v_tag == VT_NFS);
1025         np = VTONFS(vp);
1026         nmp = VFSTONFS(vp->v_mount);
1027         uiop = &uio;
1028         uiop->uio_iov = &io;
1029         uiop->uio_iovcnt = 1;
1030         uiop->uio_segflg = UIO_SYSSPACE;
1031         uiop->uio_td = td;
1032
1033         /*
1034          * clear B_ERROR and B_INVAL state prior to initiating the I/O.  We
1035          * do this here so we do not have to do it in all the code that
1036          * calls us.
1037          */
1038         bp->b_flags &= ~(B_ERROR | B_INVAL);
1039
1040         KASSERT(bp->b_cmd != BUF_CMD_DONE,
1041                 ("nfs_doio: bp %p already marked done!", bp));
1042
1043         if (bp->b_cmd == BUF_CMD_READ) {
1044             io.iov_len = uiop->uio_resid = (size_t)bp->b_bcount;
1045             io.iov_base = bp->b_data;
1046             uiop->uio_rw = UIO_READ;
1047
1048             switch (vp->v_type) {
1049             case VREG:
1050                 /*
1051                  * When reading from a regular file zero-fill any residual.
1052                  * Note that this residual has nothing to do with NFS short
1053                  * reads, which nfs_readrpc_uio() will handle for us.
1054                  *
1055                  * We have to do this because when we are write extending
1056                  * a file the server may not have the same notion of
1057                  * filesize as we do.  Our BIOs should already be sized
1058                  * (b_bcount) to account for the file EOF.
1059                  */
1060                 nfsstats.read_bios++;
1061                 uiop->uio_offset = bio->bio_offset;
1062                 error = nfs_readrpc_uio(vp, uiop);
1063                 if (error == 0 && uiop->uio_resid) {
1064                         n = (size_t)bp->b_bcount - uiop->uio_resid;
1065                         bzero(bp->b_data + n, bp->b_bcount - n);
1066                         uiop->uio_resid = 0;
1067                 }
1068                 if (td && td->td_proc && (vp->v_flag & VTEXT) &&
1069                     np->n_mtime != np->n_vattr.va_mtime.tv_sec) {
1070                         uprintf("Process killed due to text file modification\n");
1071                         ksignal(td->td_proc, SIGKILL);
1072                 }
1073                 break;
1074             case VLNK:
1075                 uiop->uio_offset = 0;
1076                 nfsstats.readlink_bios++;
1077                 error = nfs_readlinkrpc_uio(vp, uiop);
1078                 break;
1079             case VDIR:
1080                 nfsstats.readdir_bios++;
1081                 uiop->uio_offset = bio->bio_offset;
1082                 if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
1083                         error = nfs_readdirplusrpc_uio(vp, uiop);
1084                         if (error == NFSERR_NOTSUPP)
1085                                 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
1086                 }
1087                 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
1088                         error = nfs_readdirrpc_uio(vp, uiop);
1089                 /*
1090                  * end-of-directory sets B_INVAL but does not generate an
1091                  * error.
1092                  */
1093                 if (error == 0 && uiop->uio_resid == bp->b_bcount)
1094                         bp->b_flags |= B_INVAL;
1095                 break;
1096             default:
1097                 kprintf("nfs_doio:  type %x unexpected\n",vp->v_type);
1098                 break;
1099             };
1100             if (error) {
1101                 bp->b_flags |= B_ERROR;
1102                 bp->b_error = error;
1103             }
1104             bp->b_resid = uiop->uio_resid;
1105         } else {
1106             /*
1107              * If we only need to commit, try to commit.
1108              *
1109              * NOTE: The I/O has already been staged for the write and
1110              *       its pages busied, so b_dirtyoff/end is valid.
1111              */
1112             KKASSERT(bp->b_cmd == BUF_CMD_WRITE);
1113             if (bp->b_flags & B_NEEDCOMMIT) {
1114                     int retv;
1115                     off_t off;
1116
1117                     off = bio->bio_offset + bp->b_dirtyoff;
1118                     retv = nfs_commitrpc_uio(vp, off,
1119                                              bp->b_dirtyend - bp->b_dirtyoff,
1120                                              td);
1121                     if (retv == 0) {
1122                             bp->b_dirtyoff = bp->b_dirtyend = 0;
1123                             bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1124                             bp->b_resid = 0;
1125                             biodone(bio);
1126                             return(0);
1127                     }
1128                     if (retv == NFSERR_STALEWRITEVERF) {
1129                             nfs_clearcommit(vp->v_mount);
1130                     }
1131             }
1132
1133             /*
1134              * Setup for actual write
1135              */
1136             if (bio->bio_offset + bp->b_dirtyend > np->n_size)
1137                 bp->b_dirtyend = np->n_size - bio->bio_offset;
1138
1139             if (bp->b_dirtyend > bp->b_dirtyoff) {
1140                 io.iov_len = uiop->uio_resid = bp->b_dirtyend
1141                     - bp->b_dirtyoff;
1142                 uiop->uio_offset = bio->bio_offset + bp->b_dirtyoff;
1143                 io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
1144                 uiop->uio_rw = UIO_WRITE;
1145                 nfsstats.write_bios++;
1146
1147                 if ((bp->b_flags & (B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == 0)
1148                     iomode = NFSV3WRITE_UNSTABLE;
1149                 else
1150                     iomode = NFSV3WRITE_FILESYNC;
1151
1152                 must_commit = 0;
1153                 error = nfs_writerpc_uio(vp, uiop, &iomode, &must_commit);
1154
1155                 /*
1156                  * We no longer try to use kern/vfs_bio's cluster code to
1157                  * cluster commits, so B_CLUSTEROK is no longer set with
1158                  * B_NEEDCOMMIT.  The problem is that a vfs_busy_pages()
1159                  * may have to clear B_NEEDCOMMIT if it finds underlying
1160                  * pages have been redirtied through a memory mapping
1161                  * and doing this on a clustered bp will probably cause
1162                  * a panic, plus the flag in the underlying NFS bufs
1163                  * making up the cluster bp will not be properly cleared.
1164                  */
1165                 if (!error && iomode == NFSV3WRITE_UNSTABLE) {
1166                     bp->b_flags |= B_NEEDCOMMIT;
1167 #if 0
1168                     /* XXX do not enable commit clustering */
1169                     if (bp->b_dirtyoff == 0
1170                         && bp->b_dirtyend == bp->b_bcount)
1171                         bp->b_flags |= B_CLUSTEROK;
1172 #endif
1173                 } else {
1174                     bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1175                 }
1176
1177                 /*
1178                  * For an interrupted write, the buffer is still valid
1179                  * and the write hasn't been pushed to the server yet,
1180                  * so we can't set B_ERROR and report the interruption
1181                  * by setting B_EINTR. For the async case, B_EINTR
1182                  * is not relevant, so the rpc attempt is essentially
1183                  * a noop.  For the case of a V3 write rpc not being
1184                  * committed to stable storage, the block is still
1185                  * dirty and requires either a commit rpc or another
1186                  * write rpc with iomode == NFSV3WRITE_FILESYNC before
1187                  * the block is reused. This is indicated by setting
1188                  * the B_DELWRI and B_NEEDCOMMIT flags.
1189                  *
1190                  * If the buffer is marked B_PAGING, it does not reside on
1191                  * the vp's paging queues so we cannot call bdirty().  The
1192                  * bp in this case is not an NFS cache block so we should
1193                  * be safe. XXX
1194                  */
1195                 if (error == EINTR
1196                     || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
1197                         crit_enter();
1198                         bp->b_flags &= ~(B_INVAL|B_NOCACHE);
1199                         if ((bp->b_flags & B_PAGING) == 0)
1200                             bdirty(bp);
1201                         if (error)
1202                             bp->b_flags |= B_EINTR;
1203                         crit_exit();
1204                 } else {
1205                     if (error) {
1206                         bp->b_flags |= B_ERROR;
1207                         bp->b_error = np->n_error = error;
1208                         np->n_flag |= NWRITEERR;
1209                     }
1210                     bp->b_dirtyoff = bp->b_dirtyend = 0;
1211                 }
1212                 if (must_commit)
1213                     nfs_clearcommit(vp->v_mount);
1214                 bp->b_resid = uiop->uio_resid;
1215             } else {
1216                 bp->b_resid = 0;
1217             }
1218         }
1219
1220         /*
1221          * I/O was run synchronously, biodone() it and calculate the
1222          * error to return.
1223          */
1224         biodone(bio);
1225         KKASSERT(bp->b_cmd == BUF_CMD_DONE);
1226         if (bp->b_flags & B_EINTR)
1227                 return (EINTR);
1228         if (bp->b_flags & B_ERROR)
1229                 return (bp->b_error ? bp->b_error : EIO);
1230         return (0);
1231 }
1232
1233 /*
1234  * Handle all truncation, write-extend, and ftruncate()-extend operations
1235  * on the NFS lcient side.
1236  *
1237  * We use the new API in kern/vfs_vm.c to perform these operations in a
1238  * VM-friendly way.  With this API VM pages are properly zerod and pages
1239  * still mapped into the buffer straddling EOF are not invalidated.
1240  */
1241 int
1242 nfs_meta_setsize(struct vnode *vp, struct thread *td, off_t nsize, int trivial)
1243 {
1244         struct nfsnode *np = VTONFS(vp);
1245         off_t osize;
1246         int biosize = vp->v_mount->mnt_stat.f_iosize;
1247         int error;
1248
1249         osize = np->n_size;
1250         np->n_size = nsize;
1251
1252         if (nsize < osize) {
1253                 error = nvtruncbuf(vp, nsize, biosize, -1);
1254         } else {
1255                 error = nvextendbuf(vp, osize, nsize,
1256                                     biosize, biosize, -1, -1,
1257                                     trivial);
1258         }
1259         return(error);
1260 }
1261
1262 /*
1263  * Synchronous completion for nfs_doio.  Call bpdone() with elseit=FALSE.
1264  * Caller is responsible for brelse()'ing the bp.
1265  */
1266 static void
1267 nfsiodone_sync(struct bio *bio)
1268 {
1269         bio->bio_flags = 0;
1270         bpdone(bio->bio_buf, 0);
1271 }
1272
/*
 * nfs read rpc - BIO version
 *
 * Builds a READ request covering the whole buffer (b_bcount bytes at
 * bio->bio_offset) and hands it to nfsm_request_bio() with
 * nfs_readrpc_bio_done() registered as the completion routine.
 *
 * If the request cannot be set up, the buffer is flagged B_ERROR with
 * the error in b_error and the bio is biodone()'d here.
 */
void
nfs_readrpc_bio(struct vnode *vp, struct bio *bio)
{
        struct buf *bp = bio->bio_buf;
        u_int32_t *tl;
        struct nfsmount *nmp;
        int error = 0, len, tsiz;
        struct nfsm_info *info;

        /* Per-request state; freed below on the failure path. */
        info = kmalloc(sizeof(*info), M_NFSREQ, M_WAITOK);
        info->mrep = NULL;
        info->v3 = NFS_ISV3(vp);

        nmp = VFSTONFS(vp->v_mount);
        tsiz = bp->b_bcount;
        /* The whole buffer must fit in a single read rpc. */
        KKASSERT(tsiz <= nmp->nm_rsize);
        if (bio->bio_offset + tsiz > nmp->nm_maxfilesize) {
                error = EFBIG;
                goto nfsmout;
        }
        nfsstats.rpccnt[NFSPROC_READ]++;
        len = tsiz;
        nfsm_reqhead(info, vp, NFSPROC_READ,
                     NFSX_FH(info->v3) + NFSX_UNSIGNED * 3);
        /* NOTE(review): ERROROUT presumably sets error and jumps to
         * nfsmout on failure - confirm against the macro definition. */
        ERROROUT(nfsm_fhtom(info, vp));
        tl = nfsm_build(info, NFSX_UNSIGNED * 3);
        if (info->v3) {
                /* v3 arguments: 64 bit offset followed by the count. */
                txdr_hyper(bio->bio_offset, tl);
                *(tl + 2) = txdr_unsigned(len);
        } else {
                /* v2 arguments: 32 bit offset, count, and a zero word. */
                *tl++ = txdr_unsigned(bio->bio_offset);
                *tl++ = txdr_unsigned(len);
                *tl = 0;
        }
        info->bio = bio;
        info->done = nfs_readrpc_bio_done;
        nfsm_request_bio(info, vp, NFSPROC_READ, NULL,
                         nfs_vpcred(vp, ND_READ));
        return;
nfsmout:
        /* Setup failed: release our state and fail the bio. */
        kfree(info, M_NFSREQ);
        bp->b_error = error;
        bp->b_flags |= B_ERROR;
        biodone(bio);
}
1321
/*
 * Completion routine for nfs_readrpc_bio().
 *
 * Parses the READ reply, copies the returned data into the bio with
 * nfsm_mtobio(), zero-fills whatever part of the buffer the reply did
 * not cover, frees the nfsm_info and biodone()'s the bio.  On a parse
 * error the buffer is flagged B_ERROR with the error in b_error.
 */
static void
nfs_readrpc_bio_done(nfsm_info_t info)
{
        struct nfsmount *nmp = VFSTONFS(info->vp->v_mount);
        struct bio *bio = info->bio;
        struct buf *bp = bio->bio_buf;
        u_int32_t *tl;
        int attrflag;
        int retlen;
        int eof;
        int error = 0;

        KKASSERT(info->state == NFSM_STATE_DONE);

        /*
         * NOTE(review): ERROROUT/NULLOUT/NEGATIVEOUT presumably jump to
         * nfsmout with error set when the wrapped call fails - confirm
         * against the macro definitions.
         */
        if (info->v3) {
                ERROROUT(nfsm_postop_attr(info, info->vp, &attrflag,
                                         NFS_LATTR_NOSHRINK));
                NULLOUT(tl = nfsm_dissect(info, 2 * NFSX_UNSIGNED));
                /* The second word of the dissected pair is the eof flag. */
                eof = fxdr_unsigned(int, *(tl + 1));
        } else {
                ERROROUT(nfsm_loadattr(info, info->vp, NULL));
                eof = 0;
        }
        /* Length of the returned data, limited to nm_rsize. */
        NEGATIVEOUT(retlen = nfsm_strsiz(info, nmp->nm_rsize));
        ERROROUT(nfsm_mtobio(info, bio, retlen));
        m_freem(info->mrep);
        info->mrep = NULL;

        /*
         * No error occurred.  If retlen is less than bcount, no EOF was
         * signalled and this is NFSv3, a zero-fill short read occurred.
         *
         * For NFSv2 a short-read indicates EOF.
         */
        if (retlen < bp->b_bcount && info->v3 && eof == 0) {
                bzero(bp->b_data + retlen, bp->b_bcount - retlen);
                retlen = bp->b_bcount;
        }

        /*
         * If we hit an EOF we still zero-fill, but return the expected
         * b_resid anyway.  This should normally not occur since async
         * BIOs are not used for read-before-write case.  Races against
         * the server can cause it though and we don't want to leave
         * garbage in the buffer.
         */
        if (retlen < bp->b_bcount) {
                bzero(bp->b_data + retlen, bp->b_bcount - retlen);
        }
        /* The buffer is always reported as completely filled. */
        bp->b_resid = 0;
        /* bp->b_resid = bp->b_bcount - retlen; */
nfsmout:
        /* Common exit for success and failure. */
        kfree(info, M_NFSREQ);
        if (error) {
                bp->b_error = error;
                bp->b_flags |= B_ERROR;
        }
        biodone(bio);
}
1381
/*
 * nfs write call - BIO version
 *
 * Builds a WRITE request for the buffer's b_dirtyoff..b_dirtyend range
 * and hands it to nfsm_request_bio() with nfs_writerpc_bio_done()
 * registered as the completion routine.
 *
 * The bio is biodone()'d here without issuing an rpc when there is
 * nothing to write, when the range would exceed nm_maxfilesize
 * (B_ERROR/EFBIG), or when building the request fails (B_ERROR with the
 * error in b_error).
 *
 * NOTE: Caller has already busied the I/O.
 */
void
nfs_writerpc_bio(struct vnode *vp, struct bio *bio)
{
        struct nfsmount *nmp = VFSTONFS(vp->v_mount);
        struct nfsnode *np = VTONFS(vp);
        struct buf *bp = bio->bio_buf;
        u_int32_t *tl;
        int len;
        int iomode;
        int error = 0;
        struct nfsm_info *info;
        off_t offset;

        /*
         * Setup for actual write.  Just clean up the bio if there
         * is nothing to do.  b_dirtyoff/end have already been staged
         * by the bp's pages getting busied.
         */
        /* Never write past our notion of the file size. */
        if (bio->bio_offset + bp->b_dirtyend > np->n_size)
                bp->b_dirtyend = np->n_size - bio->bio_offset;

        if (bp->b_dirtyend <= bp->b_dirtyoff) {
                bp->b_resid = 0;
                biodone(bio);
                return;
        }
        len = bp->b_dirtyend - bp->b_dirtyoff;
        offset = bio->bio_offset + bp->b_dirtyoff;
        if (offset + len > nmp->nm_maxfilesize) {
                bp->b_flags |= B_ERROR;
                bp->b_error = EFBIG;
                biodone(bio);
                return;
        }
        /* nfs_writerpc_bio_done() reads the write length back from b_resid. */
        bp->b_resid = len;
        nfsstats.write_bios++;

        /* Per-request state; freed below on the failure path. */
        info = kmalloc(sizeof(*info), M_NFSREQ, M_WAITOK);
        info->mrep = NULL;
        info->v3 = NFS_ISV3(vp);
        info->info_writerpc.must_commit = 0;
        /*
         * Use an unstable write unless the buffer needs a commit, is not
         * to be cached, or is a cluster buffer.
         */
        if ((bp->b_flags & (B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == 0)
                iomode = NFSV3WRITE_UNSTABLE;
        else
                iomode = NFSV3WRITE_FILESYNC;

        /* The dirty range must fit in a single write rpc. */
        KKASSERT(len <= nmp->nm_wsize);

        nfsstats.rpccnt[NFSPROC_WRITE]++;
        nfsm_reqhead(info, vp, NFSPROC_WRITE,
                     NFSX_FH(info->v3) + 5 * NFSX_UNSIGNED + nfsm_rndup(len));
        /* NOTE(review): ERROROUT presumably sets error and jumps to
         * nfsmout on failure - confirm against the macro definition. */
        ERROROUT(nfsm_fhtom(info, vp));
        if (info->v3) {
                /* v3 arguments: 64 bit offset, count, iomode, data length. */
                tl = nfsm_build(info, 5 * NFSX_UNSIGNED);
                txdr_hyper(offset, tl);
                tl += 2;
                *tl++ = txdr_unsigned(len);
                *tl++ = txdr_unsigned(iomode);
                *tl = txdr_unsigned(len);
        } else {
                u_int32_t x;

                /* v2 arguments: the offset is truncated to 32 bits. */
                tl = nfsm_build(info, 4 * NFSX_UNSIGNED);
                /* Set both "begin" and "current" to non-garbage. */
                x = txdr_unsigned((u_int32_t)offset);
                *tl++ = x;      /* "begin offset" */
                *tl++ = x;      /* "current offset" */
                x = txdr_unsigned(len);
                *tl++ = x;      /* total to this offset */
                *tl = x;        /* size of this write */
        }
        /* Append the dirty range of the buffer as the write payload. */
        ERROROUT(nfsm_biotom(info, bio, bp->b_dirtyoff, len));
        info->bio = bio;
        info->done = nfs_writerpc_bio_done;
        nfsm_request_bio(info, vp, NFSPROC_WRITE, NULL,
                         nfs_vpcred(vp, ND_WRITE));
        return;
nfsmout:
        /* Setup failed: release our state and fail the bio. */
        kfree(info, M_NFSREQ);
        bp->b_error = error;
        bp->b_flags |= B_ERROR;
        biodone(bio);
}
1470
1471 static void
1472 nfs_writerpc_bio_done(nfsm_info_t info)
1473 {
1474         struct nfsmount *nmp = VFSTONFS(info->vp->v_mount);
1475         struct nfsnode *np = VTONFS(info->vp);
1476         struct bio *bio = info->bio;
1477         struct buf *bp = bio->bio_buf;
1478         int wccflag = NFSV3_WCCRATTR;
1479         int iomode = NFSV3WRITE_FILESYNC;
1480         int commit;
1481         int rlen;
1482         int error;
1483         int len = bp->b_resid;  /* b_resid was set to shortened length */
1484         u_int32_t *tl;
1485
1486         if (info->v3) {
1487                 /*
1488                  * The write RPC returns a before and after mtime.  The
1489                  * nfsm_wcc_data() macro checks the before n_mtime
1490                  * against the before time and stores the after time
1491                  * in the nfsnode's cached vattr and n_mtime field.
1492                  * The NRMODIFIED bit will be set if the before
1493                  * time did not match the original mtime.
1494                  */
1495                 wccflag = NFSV3_WCCCHK;
1496                 ERROROUT(nfsm_wcc_data(info, info->vp, &wccflag));
1497                 if (error == 0) {
1498                         NULLOUT(tl = nfsm_dissect(info, 2 * NFSX_UNSIGNED + NFSX_V3WRITEVERF));
1499                         rlen = fxdr_unsigned(int, *tl++);
1500                         if (rlen == 0) {
1501                                 error = NFSERR_IO;
1502                                 m_freem(info->mrep);
1503                                 info->mrep = NULL;
1504                                 goto nfsmout;
1505                         } else if (rlen < len) {
1506 #if 0
1507                                 /*
1508                                  * XXX what do we do here?
1509                                  */
1510                                 backup = len - rlen;
1511                                 uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base - backup;
1512                                 uiop->uio_iov->iov_len += backup;
1513                                 uiop->uio_offset -= backup;
1514                                 uiop->uio_resid += backup;
1515                                 len = rlen;
1516 #endif
1517                         }
1518                         commit = fxdr_unsigned(int, *tl++);
1519
1520                         /*
1521                          * Return the lowest committment level
1522                          * obtained by any of the RPCs.
1523                          */
1524                         if (iomode == NFSV3WRITE_FILESYNC)
1525                                 iomode = commit;
1526                         else if (iomode == NFSV3WRITE_DATASYNC &&
1527                                 commit == NFSV3WRITE_UNSTABLE)
1528                                 iomode = commit;
1529                         if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0){
1530                             bcopy(tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF);
1531                             nmp->nm_state |= NFSSTA_HASWRITEVERF;
1532                         } else if (bcmp(tl, nmp->nm_verf, NFSX_V3WRITEVERF)) {
1533                             info->info_writerpc.must_commit = 1;
1534                             bcopy(tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF);
1535                         }
1536                 }
1537         } else {
1538                 ERROROUT(nfsm_loadattr(info, info->vp, NULL));
1539         }
1540         m_freem(info->mrep);
1541         info->mrep = NULL;
1542         len = 0;
1543 nfsmout:
1544         if (info->vp->v_mount->mnt_flag & MNT_ASYNC)
1545                 iomode = NFSV3WRITE_FILESYNC;
1546         bp->b_resid = len;
1547
1548         /*
1549          * End of RPC.  Now clean up the bp.
1550          *
1551          * We no longer enable write clustering for commit operations,
1552          * See around line 1157 for a more detailed comment.
1553          */
1554         if (!error && iomode == NFSV3WRITE_UNSTABLE) {
1555                 bp->b_flags |= B_NEEDCOMMIT;
1556 #if 0
1557                 /* XXX do not enable commit clustering */
1558                 if (bp->b_dirtyoff == 0 && bp->b_dirtyend == bp->b_bcount)
1559                         bp->b_flags |= B_CLUSTEROK;
1560 #endif
1561         } else {
1562                 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1563         }
1564
1565         /*
1566          * For an interrupted write, the buffer is still valid
1567          * and the write hasn't been pushed to the server yet,
1568          * so we can't set B_ERROR and report the interruption
1569          * by setting B_EINTR. For the async case, B_EINTR
1570          * is not relevant, so the rpc attempt is essentially
1571          * a noop.  For the case of a V3 write rpc not being
1572          * committed to stable storage, the block is still
1573          * dirty and requires either a commit rpc or another
1574          * write rpc with iomode == NFSV3WRITE_FILESYNC before
1575          * the block is reused. This is indicated by setting
1576          * the B_DELWRI and B_NEEDCOMMIT flags.
1577          *
1578          * If the buffer is marked B_PAGING, it does not reside on
1579          * the vp's paging queues so we cannot call bdirty().  The
1580          * bp in this case is not an NFS cache block so we should
1581          * be safe. XXX
1582          */
1583         if (error == EINTR || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
1584                 crit_enter();
1585                 bp->b_flags &= ~(B_INVAL|B_NOCACHE);
1586                 if ((bp->b_flags & B_PAGING) == 0)
1587                         bdirty(bp);
1588                 if (error)
1589                         bp->b_flags |= B_EINTR;
1590                 crit_exit();
1591         } else {
1592                 if (error) {
1593                         bp->b_flags |= B_ERROR;
1594                         bp->b_error = np->n_error = error;
1595                         np->n_flag |= NWRITEERR;
1596                 }
1597                 bp->b_dirtyoff = bp->b_dirtyend = 0;
1598         }
1599         if (info->info_writerpc.must_commit)
1600                 nfs_clearcommit(info->vp->v_mount);
1601         kfree(info, M_NFSREQ);
1602         if (error) {
1603                 bp->b_flags |= B_ERROR;
1604                 bp->b_error = error;
1605         }
1606         biodone(bio);
1607 }
1608
1609 /*
1610  * Nfs Version 3 commit rpc - BIO version
1611  *
1612  * This function issues the commit rpc and will chain to a write
1613  * rpc if necessary.
1614  */
1615 void
1616 nfs_commitrpc_bio(struct vnode *vp, struct bio *bio)
1617 {
1618         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
1619         struct buf *bp = bio->bio_buf;
1620         struct nfsm_info *info;
1621         int error = 0;
1622         u_int32_t *tl;
1623
1624         if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) {
1625                 bp->b_dirtyoff = bp->b_dirtyend = 0;
1626                 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1627                 bp->b_resid = 0;
1628                 biodone(bio);
1629                 return;
1630         }
1631
1632         info = kmalloc(sizeof(*info), M_NFSREQ, M_WAITOK);
1633         info->mrep = NULL;
1634         info->v3 = 1;
1635
1636         nfsstats.rpccnt[NFSPROC_COMMIT]++;
1637         nfsm_reqhead(info, vp, NFSPROC_COMMIT, NFSX_FH(1));
1638         ERROROUT(nfsm_fhtom(info, vp));
1639         tl = nfsm_build(info, 3 * NFSX_UNSIGNED);
1640         txdr_hyper(bio->bio_offset + bp->b_dirtyoff, tl);
1641         tl += 2;
1642         *tl = txdr_unsigned(bp->b_dirtyend - bp->b_dirtyoff);
1643         info->bio = bio;
1644         info->done = nfs_commitrpc_bio_done;
1645         nfsm_request_bio(info, vp, NFSPROC_COMMIT, NULL,
1646                          nfs_vpcred(vp, ND_WRITE));
1647         return;
1648 nfsmout:
1649         /*
1650          * Chain to write RPC on (early) error
1651          */
1652         kfree(info, M_NFSREQ);
1653         nfs_writerpc_bio(vp, bio);
1654 }
1655
1656 static void
1657 nfs_commitrpc_bio_done(nfsm_info_t info)
1658 {
1659         struct nfsmount *nmp = VFSTONFS(info->vp->v_mount);
1660         struct bio *bio = info->bio;
1661         struct buf *bp = bio->bio_buf;
1662         u_int32_t *tl;
1663         int wccflag = NFSV3_WCCRATTR;
1664         int error = 0;
1665
1666         ERROROUT(nfsm_wcc_data(info, info->vp, &wccflag));
1667         if (error == 0) {
1668                 NULLOUT(tl = nfsm_dissect(info, NFSX_V3WRITEVERF));
1669                 if (bcmp(nmp->nm_verf, tl, NFSX_V3WRITEVERF)) {
1670                         bcopy(tl, nmp->nm_verf, NFSX_V3WRITEVERF);
1671                         error = NFSERR_STALEWRITEVERF;
1672                 }
1673         }
1674         m_freem(info->mrep);
1675         info->mrep = NULL;
1676
1677         /*
1678          * On completion we must chain to a write bio if an
1679          * error occurred.
1680          */
1681 nfsmout:
1682         kfree(info, M_NFSREQ);
1683         if (error == 0) {
1684                 bp->b_dirtyoff = bp->b_dirtyend = 0;
1685                 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1686                 bp->b_resid = 0;
1687                 biodone(bio);
1688         } else {
1689                 nfs_writerpc_bio(info->vp, bio);
1690         }
1691 }
1692