kernel/fs/nfs/nfs_srv.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright (c) 2016 by Delphix. All rights reserved.
  25  */
  26
  27 /*
  28  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  29  *      All rights reserved.
  30  */
  31
  32 #include <sys/param.h>
  33 #include <sys/types.h>
  34 #include <sys/systm.h>
  35 #include <sys/cred.h>
  36 #include <sys/buf.h>
  37 #include <sys/vfs.h>
  38 #include <sys/vnode.h>
  39 #include <sys/uio.h>
  40 #include <sys/stat.h>
  41 #include <sys/errno.h>
  42 #include <sys/sysmacros.h>
  43 #include <sys/statvfs.h>
  44 #include <sys/kmem.h>
  45 #include <sys/kstat.h>
  46 #include <sys/dirent.h>
  47 #include <sys/cmn_err.h>
  48 #include <sys/debug.h>
  49 #include <sys/vtrace.h>
  50 #include <sys/mode.h>
  51 #include <sys/acl.h>
  52 #include <sys/nbmlock.h>
  53 #include <sys/policy.h>
  54 #include <sys/sdt.h>
  55
  56 #include <rpc/types.h>
  57 #include <rpc/auth.h>
  58 #include <rpc/svc.h>
  59
  60 #include <nfs/nfs.h>
  61 #include <nfs/export.h>
  62 #include <nfs/nfs_cmd.h>
  63
  64 #include <vm/hat.h>
  65 #include <vm/as.h>
  66 #include <vm/seg.h>
  67 #include <vm/seg_map.h>
  68 #include <vm/seg_kmem.h>
  69
  70 #include <sys/strsubr.h>
  71
  72 /*
  73  * These are the interface routines for the server side of the
  74  * Network File System.  See the NFS version 2 protocol specification
  75  * for a description of this interface.
  76  */
  77
  78 static int      sattr_to_vattr(struct nfssattr *, struct vattr *);
  79 static void     acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
  80                         cred_t *);
  81
  82 /*
  83  * Some "over the wire" UNIX file types.  These are encoded
  84  * into the mode.  This needs to be fixed in the next rev.
  85  */
  86 #define IFMT            0170000         /* type of file */
  87 #define IFCHR           0020000         /* character special */
  88 #define IFBLK           0060000         /* block special */
  89 #define IFSOCK          0140000         /* socket */
  90
  91 u_longlong_t nfs2_srv_caller_id;
  92
  93 /*
  94  * Get file attributes.
  95  * Returns the current attributes of the file with the given fhandle.
  96  */
  97 /* ARGSUSED */
  98 void
  99 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
 100     struct svc_req *req, cred_t *cr, bool_t ro)
 101 {
 102         int error;
 103         vnode_t *vp;
 104         struct vattr va;
 105
 106         vp = nfs_fhtovp(fhp, exi);
 107         if (vp == NULL) {
 108                 ns->ns_status = NFSERR_STALE;
 109                 return;
 110         }
 111
 112         /*
 113          * Do the getattr.
 114          */
 115         va.va_mask = AT_ALL;    /* we want all the attributes */
 116
 117         error = rfs4_delegated_getattr(vp, &va, 0, cr);
 118
 119         /* check for overflows */
 120         if (!error) {
 121                 /* Lie about the object type for a referral */
 122                 if (vn_is_nfs_reparse(vp, cr))
 123                         va.va_type = VLNK;
 124
 125                 acl_perm(vp, exi, &va, cr);
 126                 error = vattr_to_nattr(&va, &ns->ns_attr);
 127         }
 128
 129         VN_RELE(vp);
 130
 131         ns->ns_status = puterrno(error);
 132 }
 133 void *
 134 rfs_getattr_getfh(fhandle_t *fhp)
 135 {
 136         return (fhp);
 137 }
 138
 139 /*
 140  * Set file attributes.
 141  * Sets the attributes of the file with the given fhandle.  Returns
 142  * the new attributes.
 143  */
 144 /* ARGSUSED */
 145 void
 146 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
 147     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 148 {
 149         int error;
 150         int flag;
 151         int in_crit = 0;
 152         vnode_t *vp;
 153         struct vattr va;
 154         struct vattr bva;
 155         struct flock64 bf;
 156         caller_context_t ct;
 157
 158
 159         vp = nfs_fhtovp(&args->saa_fh, exi);
 160         if (vp == NULL) {
 161                 ns->ns_status = NFSERR_STALE;
 162                 return;
 163         }
 164
 165         if (rdonly(ro, vp)) {
 166                 VN_RELE(vp);
 167                 ns->ns_status = NFSERR_ROFS;
 168                 return;
 169         }
 170
 171         error = sattr_to_vattr(&args->saa_sa, &va);
 172         if (error) {
 173                 VN_RELE(vp);
 174                 ns->ns_status = puterrno(error);
 175                 return;
 176         }
 177
 178         /*
 179          * If the client is requesting a change to the mtime,
 180          * but the nanosecond field is set to 1 billion, then
 181          * this is a flag to the server that it should set the
 182          * atime and mtime fields to the server's current time.
 183          * The 1 billion number actually came from the client
 184          * as 1 million, but the units in the over the wire
 185          * request are microseconds instead of nanoseconds.
 186          *
 187          * This is an overload of the protocol and should be
 188          * documented in the NFS Version 2 protocol specification.
 189          */
 190         if (va.va_mask & AT_MTIME) {
 191                 if (va.va_mtime.tv_nsec == 1000000000) {
 192                         gethrestime(&va.va_mtime);
 193                         va.va_atime = va.va_mtime;
 194                         va.va_mask |= AT_ATIME;
 195                         flag = 0;
 196                 } else
 197                         flag = ATTR_UTIME;
 198         } else
 199                 flag = 0;
 200
 201         /*
 202          * If the filesystem is exported with nosuid, then mask off
 203          * the setuid and setgid bits.
 204          */
 205         if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
 206             (exi->exi_export.ex_flags & EX_NOSUID))
 207                 va.va_mode &= ~(VSUID | VSGID);
 208
 209         ct.cc_sysid = 0;
 210         ct.cc_pid = 0;
 211         ct.cc_caller_id = nfs2_srv_caller_id;
 212         ct.cc_flags = CC_DONTBLOCK;
 213
 214         /*
 215          * We need to specially handle size changes because it is
 216          * possible for the client to create a file with modes
 217          * which indicate read-only, but with the file opened for
 218          * writing.  If the client then tries to set the size of
 219          * the file, then the normal access checking done in
 220          * fop_setattr would prevent the client from doing so,
 221          * although it should be legal for it to do so.  To get
 222          * around this, we do the access checking for ourselves
 223          * and then use fop_space which doesn't do the access
 224          * checking which fop_setattr does. fop_space can only
 225          * operate on VREG files, let fop_setattr handle the other
 226          * extremely rare cases.
 227          * Also the client should not be allowed to change the
 228          * size of the file if there is a conflicting non-blocking
 229          * mandatory lock in the region of change.
 230          */
 231         if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
 232                 if (nbl_need_check(vp)) {
 233                         nbl_start_crit(vp, RW_READER);
 234                         in_crit = 1;
 235                 }
 236
 237                 bva.va_mask = AT_UID | AT_SIZE;
 238
 239                 error = fop_getattr(vp, &bva, 0, cr, &ct);
 240
 241                 if (error) {
 242                         if (in_crit)
 243                                 nbl_end_crit(vp);
 244                         VN_RELE(vp);
 245                         ns->ns_status = puterrno(error);
 246                         return;
 247                 }
 248
 249                 if (in_crit) {
 250                         uoff_t offset;
 251                         ssize_t length;
 252
 253                         if (va.va_size < bva.va_size) {
 254                                 offset = va.va_size;
 255                                 length = bva.va_size - va.va_size;
 256                         } else {
 257                                 offset = bva.va_size;
 258                                 length = va.va_size - bva.va_size;
 259                         }
 260                         if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
 261                             NULL)) {
 262                                 error = EACCES;
 263                         }
 264                 }
 265
 266                 if (crgetuid(cr) == bva.va_uid && !error &&
 267                     va.va_size != bva.va_size) {
 268                         va.va_mask &= ~AT_SIZE;
 269                         bf.l_type = F_WRLCK;
 270                         bf.l_whence = 0;
 271                         bf.l_start = (off64_t)va.va_size;
 272                         bf.l_len = 0;
 273                         bf.l_sysid = 0;
 274                         bf.l_pid = 0;
 275
 276                         error = fop_space(vp, F_FREESP, &bf, FWRITE,
 277                             (offset_t)va.va_size, cr, &ct);
 278                 }
 279                 if (in_crit)
 280                         nbl_end_crit(vp);
 281         } else
 282                 error = 0;
 283
 284         /*
 285          * Do the setattr.
 286          */
 287         if (!error && va.va_mask) {
 288                 error = fop_setattr(vp, &va, flag, cr, &ct);
 289         }
 290
 291         /*
 292          * check if the monitor on either vop_space or vop_setattr detected
 293          * a delegation conflict and if so, mark the thread flag as
 294          * wouldblock so that the response is dropped and the client will
 295          * try again.
 296          */
 297         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
 298                 VN_RELE(vp);
 299                 curthread->t_flag |= T_WOULDBLOCK;
 300                 return;
 301         }
 302
 303         if (!error) {
 304                 va.va_mask = AT_ALL;    /* get everything */
 305
 306                 error = rfs4_delegated_getattr(vp, &va, 0, cr);
 307
 308                 /* check for overflows */
 309                 if (!error) {
 310                         acl_perm(vp, exi, &va, cr);
 311                         error = vattr_to_nattr(&va, &ns->ns_attr);
 312                 }
 313         }
 314
 315         ct.cc_flags = 0;
 316
 317         /*
 318          * Force modified metadata out to stable storage.
 319          */
 320         (void) fop_fsync(vp, FNODSYNC, cr, &ct);
 321
 322         VN_RELE(vp);
 323
 324         ns->ns_status = puterrno(error);
 325 }
 326 void *
 327 rfs_setattr_getfh(struct nfssaargs *args)
 328 {
 329         return (&args->saa_fh);
 330 }
 331
 332 /*
 333  * Directory lookup.
 334  * Returns an fhandle and file attributes for file name in a directory.
 335  */
 336 /* ARGSUSED */
 337 void
 338 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
 339     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 340 {
 341         int error;
 342         vnode_t *dvp;
 343         vnode_t *vp;
 344         struct vattr va;
 345         fhandle_t *fhp = da->da_fhandle;
 346         struct sec_ol sec = {0, 0};
 347         bool_t publicfh_flag = FALSE, auth_weak = FALSE;
 348         char *name;
 349         struct sockaddr *ca;
 350
 351         /*
 352          * Disallow NULL paths
 353          */
 354         if (da->da_name == NULL || *da->da_name == '\0') {
 355                 dr->dr_status = NFSERR_ACCES;
 356                 return;
 357         }
 358
 359         /*
 360          * Allow lookups from the root - the default
 361          * location of the public filehandle.
 362          */
 363         if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
 364                 dvp = rootdir;
 365                 VN_HOLD(dvp);
 366         } else {
 367                 dvp = nfs_fhtovp(fhp, exi);
 368                 if (dvp == NULL) {
 369                         dr->dr_status = NFSERR_STALE;
 370                         return;
 371                 }
 372         }
 373
 374         /*
 375          * Not allow lookup beyond root.
 376          * If the filehandle matches a filehandle of the exi,
 377          * then the ".." refers beyond the root of an exported filesystem.
 378          */
 379         if (strcmp(da->da_name, "..") == 0 &&
 380             EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
 381                 VN_RELE(dvp);
 382                 dr->dr_status = NFSERR_NOENT;
 383                 return;
 384         }
 385
 386         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
 387         name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
 388             MAXPATHLEN);
 389
 390         if (name == NULL) {
 391                 dr->dr_status = NFSERR_ACCES;
 392                 return;
 393         }
 394
 395         /*
 396          * If the public filehandle is used then allow
 397          * a multi-component lookup, i.e. evaluate
 398          * a pathname and follow symbolic links if
 399          * necessary.
 400          *
 401          * This may result in a vnode in another filesystem
 402          * which is OK as long as the filesystem is exported.
 403          */
 404         if (PUBLIC_FH2(fhp)) {
 405                 publicfh_flag = TRUE;
 406                 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
 407                     &sec);
 408         } else {
 409                 /*
 410                  * Do a normal single component lookup.
 411                  */
 412                 error = fop_lookup(dvp, name, &vp, NULL, 0, NULL, cr,
 413                     NULL, NULL, NULL);
 414         }
 415
 416         if (name != da->da_name)
 417                 kmem_free(name, MAXPATHLEN);
 418
 419
 420         if (!error) {
 421                 va.va_mask = AT_ALL;    /* we want everything */
 422
 423                 error = rfs4_delegated_getattr(vp, &va, 0, cr);
 424
 425                 /* check for overflows */
 426                 if (!error) {
 427                         acl_perm(vp, exi, &va, cr);
 428                         error = vattr_to_nattr(&va, &dr->dr_attr);
 429                         if (!error) {
 430                                 if (sec.sec_flags & SEC_QUERY)
 431                                         error = makefh_ol(&dr->dr_fhandle, exi,
 432                                             sec.sec_index);
 433                                 else {
 434                                         error = makefh(&dr->dr_fhandle, vp,
 435                                             exi);
 436                                         if (!error && publicfh_flag &&
 437                                             !chk_clnt_sec(exi, req))
 438                                                 auth_weak = TRUE;
 439                                 }
 440                         }
 441                 }
 442                 VN_RELE(vp);
 443         }
 444
 445         VN_RELE(dvp);
 446
 447         /*
 448          * If publicfh_flag is true then we have called rfs_publicfh_mclookup
 449          * and have obtained a new exportinfo in exi which needs to be
 450          * released. Note the the original exportinfo pointed to by exi
 451          * will be released by the caller, comon_dispatch.
 452          */
 453         if (publicfh_flag && exi != NULL)
 454                 exi_rele(exi);
 455
 456         /*
 457          * If it's public fh, no 0x81, and client's flavor is
 458          * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
 459          * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
 460          */
 461         if (auth_weak)
 462                 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
 463         else
 464                 dr->dr_status = puterrno(error);
 465 }
 466 void *
 467 rfs_lookup_getfh(struct nfsdiropargs *da)
 468 {
 469         return (da->da_fhandle);
 470 }
 471
 472 /*
 473  * Read symbolic link.
 474  * Returns the string in the symbolic link at the given fhandle.
 475  */
 476 /* ARGSUSED */
 477 void
 478 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
 479     struct svc_req *req, cred_t *cr, bool_t ro)
 480 {
 481         int error;
 482         struct iovec iov;
 483         struct uio uio;
 484         vnode_t *vp;
 485         struct vattr va;
 486         struct sockaddr *ca;
 487         char *name = NULL;
 488         int is_referral = 0;
 489
 490         vp = nfs_fhtovp(fhp, exi);
 491         if (vp == NULL) {
 492                 rl->rl_data = NULL;
 493                 rl->rl_status = NFSERR_STALE;
 494                 return;
 495         }
 496
 497         va.va_mask = AT_MODE;
 498
 499         error = fop_getattr(vp, &va, 0, cr, NULL);
 500
 501         if (error) {
 502                 VN_RELE(vp);
 503                 rl->rl_data = NULL;
 504                 rl->rl_status = puterrno(error);
 505                 return;
 506         }
 507
 508         if (MANDLOCK(vp, va.va_mode)) {
 509                 VN_RELE(vp);
 510                 rl->rl_data = NULL;
 511                 rl->rl_status = NFSERR_ACCES;
 512                 return;
 513         }
 514
 515         /* We lied about the object type for a referral */
 516         if (vn_is_nfs_reparse(vp, cr))
 517                 is_referral = 1;
 518
 519         /*
 520          * XNFS and RFC1094 require us to return ENXIO if argument
 521          * is not a link. BUGID 1138002.
 522          */
 523         if (vp->v_type != VLNK && !is_referral) {
 524                 VN_RELE(vp);
 525                 rl->rl_data = NULL;
 526                 rl->rl_status = NFSERR_NXIO;
 527                 return;
 528         }
 529
 530         /*
 531          * Allocate data for pathname.  This will be freed by rfs_rlfree.
 532          */
 533         rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
 534
 535         if (is_referral) {
 536                 char *s;
 537                 size_t strsz;
 538
 539                 /* Get an artificial symlink based on a referral */
 540                 s = build_symlink(vp, cr, &strsz);
 541                 global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
 542                 DTRACE_PROBE2(nfs2serv__func__referral__reflink,
 543                     vnode_t *, vp, char *, s);
 544                 if (s == NULL)
 545                         error = EINVAL;
 546                 else {
 547                         error = 0;
 548                         (void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
 549                         rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
 550                         kmem_free(s, strsz);
 551                 }
 552
 553         } else {
 554
 555                 /*
 556                  * Set up io vector to read sym link data
 557                  */
 558                 iov.iov_base = rl->rl_data;
 559                 iov.iov_len = NFS_MAXPATHLEN;
 560                 uio.uio_iov = &iov;
 561                 uio.uio_iovcnt = 1;
 562                 uio.uio_segflg = UIO_SYSSPACE;
 563                 uio.uio_extflg = UIO_COPY_CACHED;
 564                 uio.uio_loffset = 0;
 565                 uio.uio_resid = NFS_MAXPATHLEN;
 566
 567                 /*
 568                  * Do the readlink.
 569                  */
 570                 error = fop_readlink(vp, &uio, cr, NULL);
 571
 572                 rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
 573
 574                 if (!error)
 575                         rl->rl_data[rl->rl_count] = '\0';
 576
 577         }
 578
 579
 580         VN_RELE(vp);
 581
 582         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
 583         name = nfscmd_convname(ca, exi, rl->rl_data,
 584             NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
 585
 586         if (name != NULL && name != rl->rl_data) {
 587                 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
 588                 rl->rl_data = name;
 589         }
 590
 591         /*
 592          * XNFS and RFC1094 require us to return ENXIO if argument
 593          * is not a link. UFS returns EINVAL if this is the case,
 594          * so we do the mapping here. BUGID 1138002.
 595          */
 596         if (error == EINVAL)
 597                 rl->rl_status = NFSERR_NXIO;
 598         else
 599                 rl->rl_status = puterrno(error);
 600
 601 }
 602 void *
 603 rfs_readlink_getfh(fhandle_t *fhp)
 604 {
 605         return (fhp);
 606 }
 607 /*
 608  * Free data allocated by rfs_readlink
 609  */
 610 void
 611 rfs_rlfree(struct nfsrdlnres *rl)
 612 {
 613         if (rl->rl_data != NULL)
 614                 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
 615 }
 616
 617 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
 618
 619 /*
 620  * Read data.
 621  * Returns some data read from the file at the given fhandle.
 622  */
 623 /* ARGSUSED */
 624 void
 625 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
 626     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 627 {
 628         vnode_t *vp;
 629         int error;
 630         struct vattr va;
 631         struct iovec iov;
 632         struct uio uio;
 633         mblk_t *mp;
 634         int alloc_err = 0;
 635         int in_crit = 0;
 636         caller_context_t ct;
 637
 638         vp = nfs_fhtovp(&ra->ra_fhandle, exi);
 639         if (vp == NULL) {
 640                 rr->rr_data = NULL;
 641                 rr->rr_status = NFSERR_STALE;
 642                 return;
 643         }
 644
 645         if (vp->v_type != VREG) {
 646                 VN_RELE(vp);
 647                 rr->rr_data = NULL;
 648                 rr->rr_status = NFSERR_ISDIR;
 649                 return;
 650         }
 651
 652         ct.cc_sysid = 0;
 653         ct.cc_pid = 0;
 654         ct.cc_caller_id = nfs2_srv_caller_id;
 655         ct.cc_flags = CC_DONTBLOCK;
 656
 657         /*
 658          * Enter the critical region before calling fop_rwlock
 659          * to avoid a deadlock with write requests.
 660          */
 661         if (nbl_need_check(vp)) {
 662                 nbl_start_crit(vp, RW_READER);
 663                 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
 664                     0, NULL)) {
 665                         nbl_end_crit(vp);
 666                         VN_RELE(vp);
 667                         rr->rr_data = NULL;
 668                         rr->rr_status = NFSERR_ACCES;
 669                         return;
 670                 }
 671                 in_crit = 1;
 672         }
 673
 674         error = fop_rwlock(vp, V_WRITELOCK_FALSE, &ct);
 675
 676         /* check if a monitor detected a delegation conflict */
 677         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
 678                 VN_RELE(vp);
 679                 /* mark as wouldblock so response is dropped */
 680                 curthread->t_flag |= T_WOULDBLOCK;
 681
 682                 rr->rr_data = NULL;
 683                 return;
 684         }
 685
 686         va.va_mask = AT_ALL;
 687
 688         error = fop_getattr(vp, &va, 0, cr, &ct);
 689
 690         if (error) {
 691                 fop_rwunlock(vp, V_WRITELOCK_FALSE, &ct);
 692                 if (in_crit)
 693                         nbl_end_crit(vp);
 694
 695                 VN_RELE(vp);
 696                 rr->rr_data = NULL;
 697                 rr->rr_status = puterrno(error);
 698
 699                 return;
 700         }
 701
 702         /*
 703          * This is a kludge to allow reading of files created
 704          * with no read permission.  The owner of the file
 705          * is always allowed to read it.
 706          */
 707         if (crgetuid(cr) != va.va_uid) {
 708                 error = fop_access(vp, VREAD, 0, cr, &ct);
 709
 710                 if (error) {
 711                         /*
 712                          * Exec is the same as read over the net because
 713                          * of demand loading.
 714                          */
 715                         error = fop_access(vp, VEXEC, 0, cr, &ct);
 716                 }
 717                 if (error) {
 718                         fop_rwunlock(vp, V_WRITELOCK_FALSE, &ct);
 719                         if (in_crit)
 720                                 nbl_end_crit(vp);
 721                         VN_RELE(vp);
 722                         rr->rr_data = NULL;
 723                         rr->rr_status = puterrno(error);
 724
 725                         return;
 726                 }
 727         }
 728
 729         if (MANDLOCK(vp, va.va_mode)) {
 730                 fop_rwunlock(vp, V_WRITELOCK_FALSE, &ct);
 731                 if (in_crit)
 732                         nbl_end_crit(vp);
 733
 734                 VN_RELE(vp);
 735                 rr->rr_data = NULL;
 736                 rr->rr_status = NFSERR_ACCES;
 737
 738                 return;
 739         }
 740
 741         rr->rr_ok.rrok_wlist_len = 0;
 742         rr->rr_ok.rrok_wlist = NULL;
 743
 744         if ((uoff_t)ra->ra_offset >= va.va_size) {
 745                 rr->rr_count = 0;
 746                 rr->rr_data = NULL;
 747                 /*
 748                  * In this case, status is NFS_OK, but there is no data
 749                  * to encode. So set rr_mp to NULL.
 750                  */
 751                 rr->rr_mp = NULL;
 752                 rr->rr_ok.rrok_wlist = ra->ra_wlist;
 753                 if (rr->rr_ok.rrok_wlist)
 754                         clist_zero_len(rr->rr_ok.rrok_wlist);
 755                 goto done;
 756         }
 757
 758         if (ra->ra_wlist) {
 759                 mp = NULL;
 760                 rr->rr_mp = NULL;
 761                 (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
 762                 if (ra->ra_count > iov.iov_len) {
 763                         rr->rr_data = NULL;
 764                         rr->rr_status = NFSERR_INVAL;
 765                         goto done;
 766                 }
 767         } else {
 768                 /*
 769                  * mp will contain the data to be sent out in the read reply.
 770                  * This will be freed after the reply has been sent out (by the
 771                  * driver).
 772                  * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
 773                  * that the call to xdrmblk_putmblk() never fails.
 774                  */
 775                 mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
 776                     &alloc_err);
 777                 ASSERT(mp != NULL);
 778                 ASSERT(alloc_err == 0);
 779
 780                 rr->rr_mp = mp;
 781
 782                 /*
 783                  * Set up io vector
 784                  */
 785                 iov.iov_base = (caddr_t)mp->b_datap->db_base;
 786                 iov.iov_len = ra->ra_count;
 787         }
 788
 789         uio.uio_iov = &iov;
 790         uio.uio_iovcnt = 1;
 791         uio.uio_segflg = UIO_SYSSPACE;
 792         uio.uio_extflg = UIO_COPY_CACHED;
 793         uio.uio_loffset = (offset_t)ra->ra_offset;
 794         uio.uio_resid = ra->ra_count;
 795
 796         error = fop_read(vp, &uio, 0, cr, &ct);
 797
 798         if (error) {
 799                 if (mp)
 800                         freeb(mp);
 801
 802                 /*
 803                  * check if a monitor detected a delegation conflict and
 804                  * mark as wouldblock so response is dropped
 805                  */
 806                 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
 807                         curthread->t_flag |= T_WOULDBLOCK;
 808                 else
 809                         rr->rr_status = puterrno(error);
 810
 811                 fop_rwunlock(vp, V_WRITELOCK_FALSE, &ct);
 812                 if (in_crit)
 813                         nbl_end_crit(vp);
 814
 815                 VN_RELE(vp);
 816                 rr->rr_data = NULL;
 817
 818                 return;
 819         }
 820
 821         /*
 822          * Get attributes again so we can send the latest access
 823          * time to the client side for its cache.
 824          */
 825         va.va_mask = AT_ALL;
 826
 827         error = fop_getattr(vp, &va, 0, cr, &ct);
 828
 829         if (error) {
 830                 if (mp)
 831                         freeb(mp);
 832
 833                 fop_rwunlock(vp, V_WRITELOCK_FALSE, &ct);
 834                 if (in_crit)
 835                         nbl_end_crit(vp);
 836
 837                 VN_RELE(vp);
 838                 rr->rr_data = NULL;
 839                 rr->rr_status = puterrno(error);
 840
 841                 return;
 842         }
 843
 844         rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
 845
 846         if (mp) {
 847                 rr->rr_data = (char *)mp->b_datap->db_base;
 848         } else {
 849                 if (ra->ra_wlist) {
 850                         rr->rr_data = (caddr_t)iov.iov_base;
 851                         if (!rdma_setup_read_data2(ra, rr)) {
 852                                 rr->rr_data = NULL;
 853                                 rr->rr_status = puterrno(NFSERR_INVAL);
 854                         }
 855                 }
 856         }
 857 done:
 858         fop_rwunlock(vp, V_WRITELOCK_FALSE, &ct);
 859         if (in_crit)
 860                 nbl_end_crit(vp);
 861
 862         acl_perm(vp, exi, &va, cr);
 863
 864         /* check for overflows */
 865         error = vattr_to_nattr(&va, &rr->rr_attr);
 866
 867         VN_RELE(vp);
 868
 869         rr->rr_status = puterrno(error);
 870 }
 871
 872 /*
 873  * Free data allocated by rfs_read
 874  */
 875 void
 876 rfs_rdfree(struct nfsrdresult *rr)
 877 {
 878         mblk_t *mp;
 879
 880         if (rr->rr_status == NFS_OK) {
 881                 mp = rr->rr_mp;
 882                 if (mp != NULL)
 883                         freeb(mp);
 884         }
 885 }
 886
 887 void *
 888 rfs_read_getfh(struct nfsreadargs *ra)
 889 {
 890         return (&ra->ra_fhandle);
 891 }
 892
 893 #define MAX_IOVECS      12
 894
 895 #ifdef DEBUG
 896 static int rfs_write_sync_hits = 0;
 897 static int rfs_write_sync_misses = 0;
 898 #endif
 899
 900 /*
 901  * Write data to file.
 902  * Returns attributes of a file after writing some data to it.
 903  *
 904  * Any changes made here, especially in error handling might have
 905  * to also be done in rfs_write (which clusters write requests).
 906  */
 907 /* ARGSUSED */
 908 void
 909 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
 910     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 911 {
 912         int error;
 913         vnode_t *vp;
 914         rlim64_t rlimit;
 915         struct vattr va;
 916         struct uio uio;
 917         struct iovec iov[MAX_IOVECS];
 918         mblk_t *m;
 919         struct iovec *iovp;
 920         int iovcnt;
 921         cred_t *savecred;
 922         int in_crit = 0;
 923         caller_context_t ct;
 924
 925         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
 926         if (vp == NULL) {
 927                 ns->ns_status = NFSERR_STALE;
 928                 return;
 929         }
 930
 931         if (rdonly(ro, vp)) {
 932                 VN_RELE(vp);
 933                 ns->ns_status = NFSERR_ROFS;
 934                 return;
 935         }
 936
 937         if (vp->v_type != VREG) {
 938                 VN_RELE(vp);
 939                 ns->ns_status = NFSERR_ISDIR;
 940                 return;
 941         }
 942
 943         ct.cc_sysid = 0;
 944         ct.cc_pid = 0;
 945         ct.cc_caller_id = nfs2_srv_caller_id;
 946         ct.cc_flags = CC_DONTBLOCK;
 947
 948         va.va_mask = AT_UID|AT_MODE;
 949
 950         error = fop_getattr(vp, &va, 0, cr, &ct);
 951
 952         if (error) {
 953                 VN_RELE(vp);
 954                 ns->ns_status = puterrno(error);
 955
 956                 return;
 957         }
 958
 959         if (crgetuid(cr) != va.va_uid) {
 960                 /*
 961                  * This is a kludge to allow writes of files created
 962                  * with read only permission.  The owner of the file
 963                  * is always allowed to write it.
 964                  */
 965                 error = fop_access(vp, VWRITE, 0, cr, &ct);
 966
 967                 if (error) {
 968                         VN_RELE(vp);
 969                         ns->ns_status = puterrno(error);
 970                         return;
 971                 }
 972         }
 973
 974         /*
 975          * Can't access a mandatory lock file.  This might cause
 976          * the NFS service thread to block forever waiting for a
 977          * lock to be released that will never be released.
 978          */
 979         if (MANDLOCK(vp, va.va_mode)) {
 980                 VN_RELE(vp);
 981                 ns->ns_status = NFSERR_ACCES;
 982                 return;
 983         }
 984
 985         /*
 986          * We have to enter the critical region before calling fop_rwlock
 987          * to avoid a deadlock with ufs.
 988          */
 989         if (nbl_need_check(vp)) {
 990                 nbl_start_crit(vp, RW_READER);
 991                 in_crit = 1;
 992                 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
 993                     wa->wa_count, 0, NULL)) {
 994                         error = EACCES;
 995                         goto out;
 996                 }
 997         }
 998
 999         error = fop_rwlock(vp, V_WRITELOCK_TRUE, &ct);
1000
1001         /* check if a monitor detected a delegation conflict */
1002         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1003                 VN_RELE(vp);
1004                 /* mark as wouldblock so response is dropped */
1005                 curthread->t_flag |= T_WOULDBLOCK;
1006                 return;
1007         }
1008
1009         if (wa->wa_data || wa->wa_rlist) {
1010                 /* Do the RDMA thing if necessary */
1011                 if (wa->wa_rlist) {
1012                         iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1013                         iov[0].iov_len = wa->wa_count;
1014                 } else  {
1015                         iov[0].iov_base = wa->wa_data;
1016                         iov[0].iov_len = wa->wa_count;
1017                 }
1018                 uio.uio_iov = iov;
1019                 uio.uio_iovcnt = 1;
1020                 uio.uio_segflg = UIO_SYSSPACE;
1021                 uio.uio_extflg = UIO_COPY_DEFAULT;
1022                 uio.uio_loffset = (offset_t)wa->wa_offset;
1023                 uio.uio_resid = wa->wa_count;
1024                 /*
1025                  * The limit is checked on the client. We
1026                  * should allow any size writes here.
1027                  */
1028                 uio.uio_llimit = curproc->p_fsz_ctl;
1029                 rlimit = uio.uio_llimit - wa->wa_offset;
1030                 if (rlimit < (rlim64_t)uio.uio_resid)
1031                         uio.uio_resid = (uint_t)rlimit;
1032
1033                 /*
1034                  * for now we assume no append mode
1035                  */
1036                 /*
1037                  * We're changing creds because VM may fault and we need
1038                  * the cred of the current thread to be used if quota
1039                  * checking is enabled.
1040                  */
1041                 savecred = curthread->t_cred;
1042                 curthread->t_cred = cr;
1043                 error = fop_write(vp, &uio, FSYNC, cr, &ct);
1044                 curthread->t_cred = savecred;
1045         } else {
1046                 iovcnt = 0;
1047                 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1048                         iovcnt++;
1049                 if (iovcnt <= MAX_IOVECS) {
1050 #ifdef DEBUG
1051                         rfs_write_sync_hits++;
1052 #endif
1053                         iovp = iov;
1054                 } else {
1055 #ifdef DEBUG
1056                         rfs_write_sync_misses++;
1057 #endif
1058                         iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1059                 }
1060                 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1061                 uio.uio_iov = iovp;
1062                 uio.uio_iovcnt = iovcnt;
1063                 uio.uio_segflg = UIO_SYSSPACE;
1064                 uio.uio_extflg = UIO_COPY_DEFAULT;
1065                 uio.uio_loffset = (offset_t)wa->wa_offset;
1066                 uio.uio_resid = wa->wa_count;
1067                 /*
1068                  * The limit is checked on the client. We
1069                  * should allow any size writes here.
1070                  */
1071                 uio.uio_llimit = curproc->p_fsz_ctl;
1072                 rlimit = uio.uio_llimit - wa->wa_offset;
1073                 if (rlimit < (rlim64_t)uio.uio_resid)
1074                         uio.uio_resid = (uint_t)rlimit;
1075
1076                 /*
1077                  * For now we assume no append mode.
1078                  */
1079                 /*
1080                  * We're changing creds because VM may fault and we need
1081                  * the cred of the current thread to be used if quota
1082                  * checking is enabled.
1083                  */
1084                 savecred = curthread->t_cred;
1085                 curthread->t_cred = cr;
1086                 error = fop_write(vp, &uio, FSYNC, cr, &ct);
1087                 curthread->t_cred = savecred;
1088
1089                 if (iovp != iov)
1090                         kmem_free(iovp, sizeof (*iovp) * iovcnt);
1091         }
1092
1093         fop_rwunlock(vp, V_WRITELOCK_TRUE, &ct);
1094
1095         if (!error) {
1096                 /*
1097                  * Get attributes again so we send the latest mod
1098                  * time to the client side for its cache.
1099                  */
1100                 va.va_mask = AT_ALL;    /* now we want everything */
1101
1102                 error = fop_getattr(vp, &va, 0, cr, &ct);
1103
1104                 /* check for overflows */
1105                 if (!error) {
1106                         acl_perm(vp, exi, &va, cr);
1107                         error = vattr_to_nattr(&va, &ns->ns_attr);
1108                 }
1109         }
1110
1111 out:
1112         if (in_crit)
1113                 nbl_end_crit(vp);
1114         VN_RELE(vp);
1115
1116         /* check if a monitor detected a delegation conflict */
1117         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1118                 /* mark as wouldblock so response is dropped */
1119                 curthread->t_flag |= T_WOULDBLOCK;
1120         else
1121                 ns->ns_status = puterrno(error);
1122
1123 }
1124
1125 struct rfs_async_write {
1126         struct nfswriteargs *wa;
1127         struct nfsattrstat *ns;
1128         struct svc_req *req;
1129         cred_t *cr;
1130         bool_t ro;
1131         kthread_t *thread;
1132         struct rfs_async_write *list;
1133 };
1134
1135 struct rfs_async_write_list {
1136         fhandle_t *fhp;
1137         kcondvar_t cv;
1138         struct rfs_async_write *list;
1139         struct rfs_async_write_list *next;
1140 };
1141
1142 static struct rfs_async_write_list *rfs_async_write_head = NULL;
1143 static kmutex_t rfs_async_write_lock;
1144 static int rfs_write_async = 1; /* enables write clustering if == 1 */
1145
1146 #define MAXCLIOVECS     42
1147 #define RFSWRITE_INITVAL (enum nfsstat) -1
1148
1149 #ifdef DEBUG
1150 static int rfs_write_hits = 0;
1151 static int rfs_write_misses = 0;
1152 #endif
1153
1154 /*
1155  * Write data to file.
1156  * Returns attributes of a file after writing some data to it.
1157  */
1158 void
1159 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1160     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1161 {
1162         int error;
1163         vnode_t *vp;
1164         rlim64_t rlimit;
1165         struct vattr va;
1166         struct uio uio;
1167         struct rfs_async_write_list *lp;
1168         struct rfs_async_write_list *nlp;
1169         struct rfs_async_write *rp;
1170         struct rfs_async_write *nrp;
1171         struct rfs_async_write *trp;
1172         struct rfs_async_write *lrp;
1173         int data_written;
1174         int iovcnt;
1175         mblk_t *m;
1176         struct iovec *iovp;
1177         struct iovec *niovp;
1178         struct iovec iov[MAXCLIOVECS];
1179         int count;
1180         int rcount;
1181         uint_t off;
1182         uint_t len;
1183         struct rfs_async_write nrpsp;
1184         struct rfs_async_write_list nlpsp;
1185         ushort_t t_flag;
1186         cred_t *savecred;
1187         int in_crit = 0;
1188         caller_context_t ct;
1189
1190         if (!rfs_write_async) {
1191                 rfs_write_sync(wa, ns, exi, req, cr, ro);
1192                 return;
1193         }
1194
1195         /*
1196          * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1197          * is considered an OK.
1198          */
1199         ns->ns_status = RFSWRITE_INITVAL;
1200
1201         nrp = &nrpsp;
1202         nrp->wa = wa;
1203         nrp->ns = ns;
1204         nrp->req = req;
1205         nrp->cr = cr;
1206         nrp->ro = ro;
1207         nrp->thread = curthread;
1208
1209         /*
1210          * Look to see if there is already a cluster started
1211          * for this file.
1212          */
1213         mutex_enter(&rfs_async_write_lock);
1214         for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1215                 if (bcmp(&wa->wa_fhandle, lp->fhp,
1216                     sizeof (fhandle_t)) == 0)
1217                         break;
1218         }
1219
1220         /*
1221          * If lp is non-NULL, then there is already a cluster
1222          * started.  We need to place ourselves in the cluster
1223          * list in the right place as determined by starting
1224          * offset.  Conflicts with non-blocking mandatory locked
1225          * regions will be checked when the cluster is processed.
1226          */
1227         if (lp != NULL) {
1228                 rp = lp->list;
1229                 trp = NULL;
1230                 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1231                         trp = rp;
1232                         rp = rp->list;
1233                 }
1234                 nrp->list = rp;
1235                 if (trp == NULL)
1236                         lp->list = nrp;
1237                 else
1238                         trp->list = nrp;
1239                 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1240                         cv_wait(&lp->cv, &rfs_async_write_lock);
1241                 mutex_exit(&rfs_async_write_lock);
1242
1243                 return;
1244         }
1245
1246         /*
1247          * No cluster started yet, start one and add ourselves
1248          * to the list of clusters.
1249          */
1250         nrp->list = NULL;
1251
1252         nlp = &nlpsp;
1253         nlp->fhp = &wa->wa_fhandle;
1254         cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1255         nlp->list = nrp;
1256         nlp->next = NULL;
1257
1258         if (rfs_async_write_head == NULL) {
1259                 rfs_async_write_head = nlp;
1260         } else {
1261                 lp = rfs_async_write_head;
1262                 while (lp->next != NULL)
1263                         lp = lp->next;
1264                 lp->next = nlp;
1265         }
1266         mutex_exit(&rfs_async_write_lock);
1267
1268         /*
1269          * Convert the file handle common to all of the requests
1270          * in this cluster to a vnode.
1271          */
1272         vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1273         if (vp == NULL) {
1274                 mutex_enter(&rfs_async_write_lock);
1275                 if (rfs_async_write_head == nlp)
1276                         rfs_async_write_head = nlp->next;
1277                 else {
1278                         lp = rfs_async_write_head;
1279                         while (lp->next != nlp)
1280                                 lp = lp->next;
1281                         lp->next = nlp->next;
1282                 }
1283                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1284                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1285                         rp->ns->ns_status = NFSERR_STALE;
1286                         rp->thread->t_flag |= t_flag;
1287                 }
1288                 cv_broadcast(&nlp->cv);
1289                 mutex_exit(&rfs_async_write_lock);
1290
1291                 return;
1292         }
1293
1294         /*
1295          * Can only write regular files.  Attempts to write any
1296          * other file types fail with EISDIR.
1297          */
1298         if (vp->v_type != VREG) {
1299                 VN_RELE(vp);
1300                 mutex_enter(&rfs_async_write_lock);
1301                 if (rfs_async_write_head == nlp)
1302                         rfs_async_write_head = nlp->next;
1303                 else {
1304                         lp = rfs_async_write_head;
1305                         while (lp->next != nlp)
1306                                 lp = lp->next;
1307                         lp->next = nlp->next;
1308                 }
1309                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1310                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1311                         rp->ns->ns_status = NFSERR_ISDIR;
1312                         rp->thread->t_flag |= t_flag;
1313                 }
1314                 cv_broadcast(&nlp->cv);
1315                 mutex_exit(&rfs_async_write_lock);
1316
1317                 return;
1318         }
1319
1320         /*
1321          * Enter the critical region before calling fop_rwlock, to avoid a
1322          * deadlock with ufs.
1323          */
1324         if (nbl_need_check(vp)) {
1325                 nbl_start_crit(vp, RW_READER);
1326                 in_crit = 1;
1327         }
1328
1329         ct.cc_sysid = 0;
1330         ct.cc_pid = 0;
1331         ct.cc_caller_id = nfs2_srv_caller_id;
1332         ct.cc_flags = CC_DONTBLOCK;
1333
1334         /*
1335          * Lock the file for writing.  This operation provides
1336          * the delay which allows clusters to grow.
1337          */
1338         error = fop_rwlock(vp, V_WRITELOCK_TRUE, &ct);
1339
1340         /* check if a monitor detected a delegation conflict */
1341         if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1342                 if (in_crit)
1343                         nbl_end_crit(vp);
1344                 VN_RELE(vp);
1345                 /* mark as wouldblock so response is dropped */
1346                 curthread->t_flag |= T_WOULDBLOCK;
1347                 mutex_enter(&rfs_async_write_lock);
1348                 if (rfs_async_write_head == nlp)
1349                         rfs_async_write_head = nlp->next;
1350                 else {
1351                         lp = rfs_async_write_head;
1352                         while (lp->next != nlp)
1353                                 lp = lp->next;
1354                         lp->next = nlp->next;
1355                 }
1356                 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1357                         if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1358                                 rp->ns->ns_status = puterrno(error);
1359                                 rp->thread->t_flag |= T_WOULDBLOCK;
1360                         }
1361                 }
1362                 cv_broadcast(&nlp->cv);
1363                 mutex_exit(&rfs_async_write_lock);
1364
1365                 return;
1366         }
1367
1368         /*
1369          * Disconnect this cluster from the list of clusters.
1370          * The cluster that is being dealt with must be fixed
1371          * in size after this point, so there is no reason
1372          * to leave it on the list so that new requests can
1373          * find it.
1374          *
1375          * The algorithm is that the first write request will
1376          * create a cluster, convert the file handle to a
1377          * vnode pointer, and then lock the file for writing.
1378          * This request is not likely to be clustered with
1379          * any others.  However, the next request will create
1380          * a new cluster and be blocked in fop_rwlock while
1381          * the first request is being processed.  This delay
1382          * will allow more requests to be clustered in this
1383          * second cluster.
1384          */
1385         mutex_enter(&rfs_async_write_lock);
1386         if (rfs_async_write_head == nlp)
1387                 rfs_async_write_head = nlp->next;
1388         else {
1389                 lp = rfs_async_write_head;
1390                 while (lp->next != nlp)
1391                         lp = lp->next;
1392                 lp->next = nlp->next;
1393         }
1394         mutex_exit(&rfs_async_write_lock);
1395
1396         /*
1397          * Step through the list of requests in this cluster.
1398          * We need to check permissions to make sure that all
1399          * of the requests have sufficient permission to write
1400          * the file.  A cluster can be composed of requests
1401          * from different clients and different users on each
1402          * client.
1403          *
1404          * As a side effect, we also calculate the size of the
1405          * byte range that this cluster encompasses.
1406          */
1407         rp = nlp->list;
1408         off = rp->wa->wa_offset;
1409         len = 0;
1410         do {
1411                 if (rdonly(rp->ro, vp)) {
1412                         rp->ns->ns_status = NFSERR_ROFS;
1413                         t_flag = curthread->t_flag & T_WOULDBLOCK;
1414                         rp->thread->t_flag |= t_flag;
1415                         continue;
1416                 }
1417
1418                 va.va_mask = AT_UID|AT_MODE;
1419
1420                 error = fop_getattr(vp, &va, 0, rp->cr, &ct);
1421
1422                 if (!error) {
1423                         if (crgetuid(rp->cr) != va.va_uid) {
1424                                 /*
1425                                  * This is a kludge to allow writes of files
1426                                  * created with read only permission.  The
1427                                  * owner of the file is always allowed to
1428                                  * write it.
1429                                  */
1430                                 error = fop_access(vp, VWRITE, 0, rp->cr, &ct);
1431                         }
1432                         if (!error && MANDLOCK(vp, va.va_mode))
1433                                 error = EACCES;
1434                 }
1435
1436                 /*
1437                  * Check for a conflict with a nbmand-locked region.
1438                  */
1439                 if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1440                     rp->wa->wa_count, 0, NULL)) {
1441                         error = EACCES;
1442                 }
1443
1444                 if (error) {
1445                         rp->ns->ns_status = puterrno(error);
1446                         t_flag = curthread->t_flag & T_WOULDBLOCK;
1447                         rp->thread->t_flag |= t_flag;
1448                         continue;
1449                 }
1450                 if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1451                         len = rp->wa->wa_offset + rp->wa->wa_count - off;
1452         } while ((rp = rp->list) != NULL);
1453
1454         /*
1455          * Step through the cluster attempting to gather as many
1456          * requests which are contiguous as possible.  These
1457          * contiguous requests are handled via one call to fop_write
1458          * instead of different calls to fop_write.  We also keep
1459          * track of the fact that any data was written.
1460          */
1461         rp = nlp->list;
1462         data_written = 0;
1463         do {
1464                 /*
1465                  * Skip any requests which are already marked as having an
1466                  * error.
1467                  */
1468                 if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1469                         rp = rp->list;
1470                         continue;
1471                 }
1472
1473                 /*
1474                  * Count the number of iovec's which are required
1475                  * to handle this set of requests.  One iovec is
1476                  * needed for each data buffer, whether addressed
1477                  * by wa_data or by the b_rptr pointers in the
1478                  * mblk chains.
1479                  */
1480                 iovcnt = 0;
1481                 lrp = rp;
1482                 for (;;) {
1483                         if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1484                                 iovcnt++;
1485                         else {
1486                                 m = lrp->wa->wa_mblk;
1487                                 while (m != NULL) {
1488                                         iovcnt++;
1489                                         m = m->b_cont;
1490                                 }
1491                         }
1492                         if (lrp->list == NULL ||
1493                             lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1494                             lrp->wa->wa_offset + lrp->wa->wa_count !=
1495                             lrp->list->wa->wa_offset) {
1496                                 lrp = lrp->list;
1497                                 break;
1498                         }
1499                         lrp = lrp->list;
1500                 }
1501
1502                 if (iovcnt <= MAXCLIOVECS) {
1503 #ifdef DEBUG
1504                         rfs_write_hits++;
1505 #endif
1506                         niovp = iov;
1507                 } else {
1508 #ifdef DEBUG
1509                         rfs_write_misses++;
1510 #endif
1511                         niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1512                 }
1513                 /*
1514                  * Put together the scatter/gather iovecs.
1515                  */
1516                 iovp = niovp;
1517                 trp = rp;
1518                 count = 0;
1519                 do {
1520                         if (trp->wa->wa_data || trp->wa->wa_rlist) {
1521                                 if (trp->wa->wa_rlist) {
1522                                         iovp->iov_base =
1523                                             (char *)((trp->wa->wa_rlist)->
1524                                             u.c_daddr3);
1525                                         iovp->iov_len = trp->wa->wa_count;
1526                                 } else  {
1527                                         iovp->iov_base = trp->wa->wa_data;
1528                                         iovp->iov_len = trp->wa->wa_count;
1529                                 }
1530                                 iovp++;
1531                         } else {
1532                                 m = trp->wa->wa_mblk;
1533                                 rcount = trp->wa->wa_count;
1534                                 while (m != NULL) {
1535                                         iovp->iov_base = (caddr_t)m->b_rptr;
1536                                         iovp->iov_len = (m->b_wptr - m->b_rptr);
1537                                         rcount -= iovp->iov_len;
1538                                         if (rcount < 0)
1539                                                 iovp->iov_len += rcount;
1540                                         iovp++;
1541                                         if (rcount <= 0)
1542                                                 break;
1543                                         m = m->b_cont;
1544                                 }
1545                         }
1546                         count += trp->wa->wa_count;
1547                         trp = trp->list;
1548                 } while (trp != lrp);
1549
1550                 uio.uio_iov = niovp;
1551                 uio.uio_iovcnt = iovcnt;
1552                 uio.uio_segflg = UIO_SYSSPACE;
1553                 uio.uio_extflg = UIO_COPY_DEFAULT;
1554                 uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1555                 uio.uio_resid = count;
1556                 /*
1557                  * The limit is checked on the client. We
1558                  * should allow any size writes here.
1559                  */
1560                 uio.uio_llimit = curproc->p_fsz_ctl;
1561                 rlimit = uio.uio_llimit - rp->wa->wa_offset;
1562                 if (rlimit < (rlim64_t)uio.uio_resid)
1563                         uio.uio_resid = (uint_t)rlimit;
1564
1565                 /*
1566                  * For now we assume no append mode.
1567                  */
1568
1569                 /*
1570                  * We're changing creds because VM may fault
1571                  * and we need the cred of the current
1572                  * thread to be used if quota * checking is
1573                  * enabled.
1574                  */
1575                 savecred = curthread->t_cred;
1576                 curthread->t_cred = cr;
1577                 error = fop_write(vp, &uio, 0, rp->cr, &ct);
1578                 curthread->t_cred = savecred;
1579
1580                 /* check if a monitor detected a delegation conflict */
1581                 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1582                         /* mark as wouldblock so response is dropped */
1583                         curthread->t_flag |= T_WOULDBLOCK;
1584
1585                 if (niovp != iov)
1586                         kmem_free(niovp, sizeof (*niovp) * iovcnt);
1587
1588                 if (!error) {
1589                         data_written = 1;
1590                         /*
1591                          * Get attributes again so we send the latest mod
1592                          * time to the client side for its cache.
1593                          */
1594                         va.va_mask = AT_ALL;    /* now we want everything */
1595
1596                         error = fop_getattr(vp, &va, 0, rp->cr, &ct);
1597
1598                         if (!error)
1599                                 acl_perm(vp, exi, &va, rp->cr);
1600                 }
1601
1602                 /*
1603                  * Fill in the status responses for each request
1604                  * which was just handled.  Also, copy the latest
1605                  * attributes in to the attribute responses if
1606                  * appropriate.
1607                  */
1608                 t_flag = curthread->t_flag & T_WOULDBLOCK;
1609                 do {
1610                         rp->thread->t_flag |= t_flag;
1611                         /* check for overflows */
1612                         if (!error) {
1613                                 error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1614                         }
1615                         rp->ns->ns_status = puterrno(error);
1616                         rp = rp->list;
1617                 } while (rp != lrp);
1618         } while (rp != NULL);
1619
1620         /*
1621          * If any data was written at all, then we need to flush
1622          * the data and metadata to stable storage.
1623          */
1624         if (data_written) {
1625                 error = fop_putpage(vp, (uoff_t)off, len, 0, cr, &ct);
1626
1627                 if (!error) {
1628                         error = fop_fsync(vp, FNODSYNC, cr, &ct);
1629                 }
1630         }
1631
1632         fop_rwunlock(vp, V_WRITELOCK_TRUE, &ct);
1633
1634         if (in_crit)
1635                 nbl_end_crit(vp);
1636         VN_RELE(vp);
1637
1638         t_flag = curthread->t_flag & T_WOULDBLOCK;
1639         mutex_enter(&rfs_async_write_lock);
1640         for (rp = nlp->list; rp != NULL; rp = rp->list) {
1641                 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1642                         rp->ns->ns_status = puterrno(error);
1643                         rp->thread->t_flag |= t_flag;
1644                 }
1645         }
1646         cv_broadcast(&nlp->cv);
1647         mutex_exit(&rfs_async_write_lock);
1648
1649 }
1650
1651 void *
1652 rfs_write_getfh(struct nfswriteargs *wa)
1653 {
1654         return (&wa->wa_fhandle);
1655 }
1656
1657 /*
1658  * Create a file.
1659  * Creates a file with given attributes and returns those attributes
1660  * and an fhandle for the new file.
1661  */
1662 void
1663 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1664     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1665 {
1666         int error;
1667         int lookuperr;
1668         int in_crit = 0;
1669         struct vattr va;
1670         vnode_t *vp;
1671         vnode_t *realvp;
1672         vnode_t *dvp;
1673         char *name = args->ca_da.da_name;
1674         vnode_t *tvp = NULL;
1675         int mode;
1676         int lookup_ok;
1677         bool_t trunc;
1678         struct sockaddr *ca;
1679
1680         /*
1681          * Disallow NULL paths
1682          */
1683         if (name == NULL || *name == '\0') {
1684                 dr->dr_status = NFSERR_ACCES;
1685                 return;
1686         }
1687
1688         dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1689         if (dvp == NULL) {
1690                 dr->dr_status = NFSERR_STALE;
1691                 return;
1692         }
1693
1694         error = sattr_to_vattr(args->ca_sa, &va);
1695         if (error) {
1696                 dr->dr_status = puterrno(error);
1697                 return;
1698         }
1699
1700         /*
1701          * Must specify the mode.
1702          */
1703         if (!(va.va_mask & AT_MODE)) {
1704                 VN_RELE(dvp);
1705                 dr->dr_status = NFSERR_INVAL;
1706                 return;
1707         }
1708
1709         /*
1710          * This is a completely gross hack to make mknod
1711          * work over the wire until we can wack the protocol
1712          */
1713         if ((va.va_mode & IFMT) == IFCHR) {
1714                 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1715                         va.va_type = VFIFO;     /* xtra kludge for named pipe */
1716                 else {
1717                         va.va_type = VCHR;
1718                         /*
1719                          * uncompress the received dev_t
1720                          * if the top half is zero indicating a request
1721                          * from an `older style' OS.
1722                          */
1723                         if ((va.va_size & 0xffff0000) == 0)
1724                                 va.va_rdev = nfsv2_expdev(va.va_size);
1725                         else
1726                                 va.va_rdev = (dev_t)va.va_size;
1727                 }
1728                 va.va_mask &= ~AT_SIZE;
1729         } else if ((va.va_mode & IFMT) == IFBLK) {
1730                 va.va_type = VBLK;
1731                 /*
1732                  * uncompress the received dev_t
1733                  * if the top half is zero indicating a request
1734                  * from an `older style' OS.
1735                  */
1736                 if ((va.va_size & 0xffff0000) == 0)
1737                         va.va_rdev = nfsv2_expdev(va.va_size);
1738                 else
1739                         va.va_rdev = (dev_t)va.va_size;
1740                 va.va_mask &= ~AT_SIZE;
1741         } else if ((va.va_mode & IFMT) == IFSOCK) {
1742                 va.va_type = VSOCK;
1743         } else {
1744                 va.va_type = VREG;
1745         }
1746         va.va_mode &= ~IFMT;
1747         va.va_mask |= AT_TYPE;
1748
1749         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1750         name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1751             MAXPATHLEN);
1752         if (name == NULL) {
1753                 dr->dr_status = puterrno(EINVAL);
1754                 return;
1755         }
1756
1757         /*
1758          * Why was the choice made to use VWRITE as the mode to the
1759          * call to fop_create ? This results in a bug.  When a client
1760          * opens a file that already exists and is RDONLY, the second
1761          * open fails with an EACESS because of the mode.
1762          * bug ID 1054648.
1763          */
1764         lookup_ok = 0;
1765         mode = VWRITE;
1766         if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1767                 error = fop_lookup(dvp, name, &tvp, NULL, 0, NULL, cr,
1768                     NULL, NULL, NULL);
1769                 if (!error) {
1770                         struct vattr at;
1771
1772                         lookup_ok = 1;
1773                         at.va_mask = AT_MODE;
1774                         error = fop_getattr(tvp, &at, 0, cr, NULL);
1775                         if (!error)
1776                                 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1777                         VN_RELE(tvp);
1778                         tvp = NULL;
1779                 }
1780         }
1781
1782         if (!lookup_ok) {
1783                 if (rdonly(ro, dvp)) {
1784                         error = EROFS;
1785                 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1786                     va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1787                         error = EPERM;
1788                 } else {
1789                         error = 0;
1790                 }
1791         }
1792
1793         /*
1794          * If file size is being modified on an already existing file
1795          * make sure that there are no conflicting non-blocking mandatory
1796          * locks in the region being manipulated. Return EACCES if there
1797          * are conflicting locks.
1798          */
1799         if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1800                 lookuperr = fop_lookup(dvp, name, &tvp, NULL, 0, NULL, cr,
1801                     NULL, NULL, NULL);
1802
1803                 if (!lookuperr &&
1804                     rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1805                         VN_RELE(tvp);
1806                         curthread->t_flag |= T_WOULDBLOCK;
1807                         goto out;
1808                 }
1809
1810                 if (!lookuperr && nbl_need_check(tvp)) {
1811                         /*
1812                          * The file exists. Now check if it has any
1813                          * conflicting non-blocking mandatory locks
1814                          * in the region being changed.
1815                          */
1816                         struct vattr bva;
1817                         uoff_t offset;
1818                         ssize_t length;
1819
1820                         nbl_start_crit(tvp, RW_READER);
1821                         in_crit = 1;
1822
1823                         bva.va_mask = AT_SIZE;
1824                         error = fop_getattr(tvp, &bva, 0, cr, NULL);
1825                         if (!error) {
1826                                 if (va.va_size < bva.va_size) {
1827                                         offset = va.va_size;
1828                                         length = bva.va_size - va.va_size;
1829                                 } else {
1830                                         offset = bva.va_size;
1831                                         length = va.va_size - bva.va_size;
1832                                 }
1833                                 if (length) {
1834                                         if (nbl_conflict(tvp, NBL_WRITE,
1835                                             offset, length, 0, NULL)) {
1836                                                 error = EACCES;
1837                                         }
1838                                 }
1839                         }
1840                         if (error) {
1841                                 nbl_end_crit(tvp);
1842                                 VN_RELE(tvp);
1843                                 in_crit = 0;
1844                         }
1845                 } else if (tvp != NULL) {
1846                         VN_RELE(tvp);
1847                 }
1848         }
1849
1850         if (!error) {
1851                 /*
1852                  * If filesystem is shared with nosuid the remove any
1853                  * setuid/setgid bits on create.
1854                  */
1855                 if (va.va_type == VREG &&
1856                     exi->exi_export.ex_flags & EX_NOSUID)
1857                         va.va_mode &= ~(VSUID | VSGID);
1858
1859                 error = fop_create(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1860                     NULL, NULL);
1861
1862                 if (!error) {
1863
1864                         if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1865                                 trunc = TRUE;
1866                         else
1867                                 trunc = FALSE;
1868
1869                         if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1870                                 VN_RELE(vp);
1871                                 curthread->t_flag |= T_WOULDBLOCK;
1872                                 goto out;
1873                         }
1874                         va.va_mask = AT_ALL;
1875
1876                         error = fop_getattr(vp, &va, 0, cr, NULL);
1877
1878                         /* check for overflows */
1879                         if (!error) {
1880                                 acl_perm(vp, exi, &va, cr);
1881                                 error = vattr_to_nattr(&va, &dr->dr_attr);
1882                                 if (!error) {
1883                                         error = makefh(&dr->dr_fhandle, vp,
1884                                             exi);
1885                                 }
1886                         }
1887                         /*
1888                          * Force modified metadata out to stable storage.
1889                          *
1890                          * if a underlying vp exists, pass it to fop_fsync
1891                          */
1892                         if (fop_realvp(vp, &realvp, NULL) == 0)
1893                                 (void) fop_fsync(realvp, FNODSYNC, cr, NULL);
1894                         else
1895                                 (void) fop_fsync(vp, FNODSYNC, cr, NULL);
1896                         VN_RELE(vp);
1897                 }
1898
1899                 if (in_crit) {
1900                         nbl_end_crit(tvp);
1901                         VN_RELE(tvp);
1902                 }
1903         }
1904
1905         /*
1906          * Force modified data and metadata out to stable storage.
1907          */
1908         (void) fop_fsync(dvp, 0, cr, NULL);
1909
1910 out:
1911
1912         VN_RELE(dvp);
1913
1914         dr->dr_status = puterrno(error);
1915
1916         if (name != args->ca_da.da_name)
1917                 kmem_free(name, MAXPATHLEN);
1918 }
1919 void *
1920 rfs_create_getfh(struct nfscreatargs *args)
1921 {
1922         return (args->ca_da.da_fhandle);
1923 }
1924
1925 /*
1926  * Remove a file.
1927  * Remove named file from parent directory.
1928  */
1929 /* ARGSUSED */
1930 void
1931 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
1932     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1933 {
1934         int error = 0;
1935         vnode_t *vp;
1936         vnode_t *targvp;
1937         int in_crit = 0;
1938
1939         /*
1940          * Disallow NULL paths
1941          */
1942         if (da->da_name == NULL || *da->da_name == '\0') {
1943                 *status = NFSERR_ACCES;
1944                 return;
1945         }
1946
1947         vp = nfs_fhtovp(da->da_fhandle, exi);
1948         if (vp == NULL) {
1949                 *status = NFSERR_STALE;
1950                 return;
1951         }
1952
1953         if (rdonly(ro, vp)) {
1954                 VN_RELE(vp);
1955                 *status = NFSERR_ROFS;
1956                 return;
1957         }
1958
1959         /*
1960          * Check for a conflict with a non-blocking mandatory share reservation.
1961          */
1962         error = fop_lookup(vp, da->da_name, &targvp, NULL, 0,
1963             NULL, cr, NULL, NULL, NULL);
1964         if (error != 0) {
1965                 VN_RELE(vp);
1966                 *status = puterrno(error);
1967                 return;
1968         }
1969
1970         /*
1971          * If the file is delegated to an v4 client, then initiate
1972          * recall and drop this request (by setting T_WOULDBLOCK).
1973          * The client will eventually re-transmit the request and
1974          * (hopefully), by then, the v4 client will have returned
1975          * the delegation.
1976          */
1977
1978         if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
1979                 VN_RELE(vp);
1980                 VN_RELE(targvp);
1981                 curthread->t_flag |= T_WOULDBLOCK;
1982                 return;
1983         }
1984
1985         if (nbl_need_check(targvp)) {
1986                 nbl_start_crit(targvp, RW_READER);
1987                 in_crit = 1;
1988                 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
1989                         error = EACCES;
1990                         goto out;
1991                 }
1992         }
1993
1994         error = fop_remove(vp, da->da_name, cr, NULL, 0);
1995
1996         /*
1997          * Force modified data and metadata out to stable storage.
1998          */
1999         (void) fop_fsync(vp, 0, cr, NULL);
2000
2001 out:
2002         if (in_crit)
2003                 nbl_end_crit(targvp);
2004         VN_RELE(targvp);
2005         VN_RELE(vp);
2006
2007         *status = puterrno(error);
2008
2009 }
2010
2011 void *
2012 rfs_remove_getfh(struct nfsdiropargs *da)
2013 {
2014         return (da->da_fhandle);
2015 }
2016
2017 /*
2018  * rename a file
2019  * Give a file (from) a new name (to).
2020  */
2021 /* ARGSUSED */
2022 void
2023 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2024     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2025 {
2026         int error = 0;
2027         vnode_t *fromvp;
2028         vnode_t *tovp;
2029         struct exportinfo *to_exi;
2030         fhandle_t *fh;
2031         vnode_t *srcvp;
2032         vnode_t *targvp;
2033         int in_crit = 0;
2034
2035         fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2036         if (fromvp == NULL) {
2037                 *status = NFSERR_STALE;
2038                 return;
2039         }
2040
2041         fh = args->rna_to.da_fhandle;
2042         to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2043         if (to_exi == NULL) {
2044                 VN_RELE(fromvp);
2045                 *status = NFSERR_ACCES;
2046                 return;
2047         }
2048         exi_rele(to_exi);
2049
2050         if (to_exi != exi) {
2051                 VN_RELE(fromvp);
2052                 *status = NFSERR_XDEV;
2053                 return;
2054         }
2055
2056         tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2057         if (tovp == NULL) {
2058                 VN_RELE(fromvp);
2059                 *status = NFSERR_STALE;
2060                 return;
2061         }
2062
2063         if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2064                 VN_RELE(tovp);
2065                 VN_RELE(fromvp);
2066                 *status = NFSERR_NOTDIR;
2067                 return;
2068         }
2069
2070         /*
2071          * Disallow NULL paths
2072          */
2073         if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2074             args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2075                 VN_RELE(tovp);
2076                 VN_RELE(fromvp);
2077                 *status = NFSERR_ACCES;
2078                 return;
2079         }
2080
2081         if (rdonly(ro, tovp)) {
2082                 VN_RELE(tovp);
2083                 VN_RELE(fromvp);
2084                 *status = NFSERR_ROFS;
2085                 return;
2086         }
2087
2088         /*
2089          * Check for a conflict with a non-blocking mandatory share reservation.
2090          */
2091         error = fop_lookup(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2092             NULL, cr, NULL, NULL, NULL);
2093         if (error != 0) {
2094                 VN_RELE(tovp);
2095                 VN_RELE(fromvp);
2096                 *status = puterrno(error);
2097                 return;
2098         }
2099
2100         /* Check for delegations on the source file */
2101
2102         if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2103                 VN_RELE(tovp);
2104                 VN_RELE(fromvp);
2105                 VN_RELE(srcvp);
2106                 curthread->t_flag |= T_WOULDBLOCK;
2107                 return;
2108         }
2109
2110         /* Check for delegation on the file being renamed over, if it exists */
2111
2112         if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
2113             fop_lookup(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2114             NULL, NULL, NULL) == 0) {
2115
2116                 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2117                         VN_RELE(tovp);
2118                         VN_RELE(fromvp);
2119                         VN_RELE(srcvp);
2120                         VN_RELE(targvp);
2121                         curthread->t_flag |= T_WOULDBLOCK;
2122                         return;
2123                 }
2124                 VN_RELE(targvp);
2125         }
2126
2127
2128         if (nbl_need_check(srcvp)) {
2129                 nbl_start_crit(srcvp, RW_READER);
2130                 in_crit = 1;
2131                 if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2132                         error = EACCES;
2133                         goto out;
2134                 }
2135         }
2136
2137         error = fop_rename(fromvp, args->rna_from.da_name,
2138             tovp, args->rna_to.da_name, cr, NULL, 0);
2139
2140         if (error == 0)
2141                 vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2142                     strlen(args->rna_to.da_name));
2143
2144         /*
2145          * Force modified data and metadata out to stable storage.
2146          */
2147         (void) fop_fsync(tovp, 0, cr, NULL);
2148         (void) fop_fsync(fromvp, 0, cr, NULL);
2149
2150 out:
2151         if (in_crit)
2152                 nbl_end_crit(srcvp);
2153         VN_RELE(srcvp);
2154         VN_RELE(tovp);
2155         VN_RELE(fromvp);
2156
2157         *status = puterrno(error);
2158
2159 }
2160 void *
2161 rfs_rename_getfh(struct nfsrnmargs *args)
2162 {
2163         return (args->rna_from.da_fhandle);
2164 }
2165
2166 /*
2167  * Link to a file.
2168  * Create a file (to) which is a hard link to the given file (from).
2169  */
2170 /* ARGSUSED */
2171 void
2172 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2173     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2174 {
2175         int error;
2176         vnode_t *fromvp;
2177         vnode_t *tovp;
2178         struct exportinfo *to_exi;
2179         fhandle_t *fh;
2180
2181         fromvp = nfs_fhtovp(args->la_from, exi);
2182         if (fromvp == NULL) {
2183                 *status = NFSERR_STALE;
2184                 return;
2185         }
2186
2187         fh = args->la_to.da_fhandle;
2188         to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2189         if (to_exi == NULL) {
2190                 VN_RELE(fromvp);
2191                 *status = NFSERR_ACCES;
2192                 return;
2193         }
2194         exi_rele(to_exi);
2195
2196         if (to_exi != exi) {
2197                 VN_RELE(fromvp);
2198                 *status = NFSERR_XDEV;
2199                 return;
2200         }
2201
2202         tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2203         if (tovp == NULL) {
2204                 VN_RELE(fromvp);
2205                 *status = NFSERR_STALE;
2206                 return;
2207         }
2208
2209         if (tovp->v_type != VDIR) {
2210                 VN_RELE(tovp);
2211                 VN_RELE(fromvp);
2212                 *status = NFSERR_NOTDIR;
2213                 return;
2214         }
2215         /*
2216          * Disallow NULL paths
2217          */
2218         if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2219                 VN_RELE(tovp);
2220                 VN_RELE(fromvp);
2221                 *status = NFSERR_ACCES;
2222                 return;
2223         }
2224
2225         if (rdonly(ro, tovp)) {
2226                 VN_RELE(tovp);
2227                 VN_RELE(fromvp);
2228                 *status = NFSERR_ROFS;
2229                 return;
2230         }
2231
2232         error = fop_link(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2233
2234         /*
2235          * Force modified data and metadata out to stable storage.
2236          */
2237         (void) fop_fsync(tovp, 0, cr, NULL);
2238         (void) fop_fsync(fromvp, FNODSYNC, cr, NULL);
2239
2240         VN_RELE(tovp);
2241         VN_RELE(fromvp);
2242
2243         *status = puterrno(error);
2244
2245 }
2246 void *
2247 rfs_link_getfh(struct nfslinkargs *args)
2248 {
2249         return (args->la_from);
2250 }
2251
2252 /*
2253  * Symbolicly link to a file.
2254  * Create a file (to) with the given attributes which is a symbolic link
2255  * to the given path name (to).
2256  */
2257 void
2258 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2259     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2260 {
2261         int error;
2262         struct vattr va;
2263         vnode_t *vp;
2264         vnode_t *svp;
2265         int lerror;
2266         struct sockaddr *ca;
2267         char *name = NULL;
2268
2269         /*
2270          * Disallow NULL paths
2271          */
2272         if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2273                 *status = NFSERR_ACCES;
2274                 return;
2275         }
2276
2277         vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2278         if (vp == NULL) {
2279                 *status = NFSERR_STALE;
2280                 return;
2281         }
2282
2283         if (rdonly(ro, vp)) {
2284                 VN_RELE(vp);
2285                 *status = NFSERR_ROFS;
2286                 return;
2287         }
2288
2289         error = sattr_to_vattr(args->sla_sa, &va);
2290         if (error) {
2291                 VN_RELE(vp);
2292                 *status = puterrno(error);
2293                 return;
2294         }
2295
2296         if (!(va.va_mask & AT_MODE)) {
2297                 VN_RELE(vp);
2298                 *status = NFSERR_INVAL;
2299                 return;
2300         }
2301
2302         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2303         name = nfscmd_convname(ca, exi, args->sla_tnm,
2304             NFSCMD_CONV_INBOUND, MAXPATHLEN);
2305
2306         if (name == NULL) {
2307                 *status = NFSERR_ACCES;
2308                 return;
2309         }
2310
2311         va.va_type = VLNK;
2312         va.va_mask |= AT_TYPE;
2313
2314         error = fop_symlink(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2315
2316         /*
2317          * Force new data and metadata out to stable storage.
2318          */
2319         lerror = fop_lookup(vp, args->sla_from.da_name, &svp, NULL, 0,
2320             NULL, cr, NULL, NULL, NULL);
2321
2322         if (!lerror) {
2323                 (void) fop_fsync(svp, 0, cr, NULL);
2324                 VN_RELE(svp);
2325         }
2326
2327         /*
2328          * Force modified data and metadata out to stable storage.
2329          */
2330         (void) fop_fsync(vp, 0, cr, NULL);
2331
2332         VN_RELE(vp);
2333
2334         *status = puterrno(error);
2335         if (name != args->sla_tnm)
2336                 kmem_free(name, MAXPATHLEN);
2337
2338 }
2339 void *
2340 rfs_symlink_getfh(struct nfsslargs *args)
2341 {
2342         return (args->sla_from.da_fhandle);
2343 }
2344
2345 /*
2346  * Make a directory.
2347  * Create a directory with the given name, parent directory, and attributes.
2348  * Returns a file handle and attributes for the new directory.
2349  */
2350 /* ARGSUSED */
2351 void
2352 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2353     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2354 {
2355         int error;
2356         struct vattr va;
2357         vnode_t *dvp = NULL;
2358         vnode_t *vp;
2359         char *name = args->ca_da.da_name;
2360
2361         /*
2362          * Disallow NULL paths
2363          */
2364         if (name == NULL || *name == '\0') {
2365                 dr->dr_status = NFSERR_ACCES;
2366                 return;
2367         }
2368
2369         vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2370         if (vp == NULL) {
2371                 dr->dr_status = NFSERR_STALE;
2372                 return;
2373         }
2374
2375         if (rdonly(ro, vp)) {
2376                 VN_RELE(vp);
2377                 dr->dr_status = NFSERR_ROFS;
2378                 return;
2379         }
2380
2381         error = sattr_to_vattr(args->ca_sa, &va);
2382         if (error) {
2383                 VN_RELE(vp);
2384                 dr->dr_status = puterrno(error);
2385                 return;
2386         }
2387
2388         if (!(va.va_mask & AT_MODE)) {
2389                 VN_RELE(vp);
2390                 dr->dr_status = NFSERR_INVAL;
2391                 return;
2392         }
2393
2394         va.va_type = VDIR;
2395         va.va_mask |= AT_TYPE;
2396
2397         error = fop_mkdir(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2398
2399         if (!error) {
2400                 /*
2401                  * Attribtutes of the newly created directory should
2402                  * be returned to the client.
2403                  */
2404                 va.va_mask = AT_ALL; /* We want everything */
2405                 error = fop_getattr(dvp, &va, 0, cr, NULL);
2406
2407                 /* check for overflows */
2408                 if (!error) {
2409                         acl_perm(vp, exi, &va, cr);
2410                         error = vattr_to_nattr(&va, &dr->dr_attr);
2411                         if (!error) {
2412                                 error = makefh(&dr->dr_fhandle, dvp, exi);
2413                         }
2414                 }
2415                 /*
2416                  * Force new data and metadata out to stable storage.
2417                  */
2418                 (void) fop_fsync(dvp, 0, cr, NULL);
2419                 VN_RELE(dvp);
2420         }
2421
2422         /*
2423          * Force modified data and metadata out to stable storage.
2424          */
2425         (void) fop_fsync(vp, 0, cr, NULL);
2426
2427         VN_RELE(vp);
2428
2429         dr->dr_status = puterrno(error);
2430
2431 }
2432 void *
2433 rfs_mkdir_getfh(struct nfscreatargs *args)
2434 {
2435         return (args->ca_da.da_fhandle);
2436 }
2437
2438 /*
2439  * Remove a directory.
2440  * Remove the given directory name from the given parent directory.
2441  */
2442 /* ARGSUSED */
2443 void
2444 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2445     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2446 {
2447         int error;
2448         vnode_t *vp;
2449
2450         /*
2451          * Disallow NULL paths
2452          */
2453         if (da->da_name == NULL || *da->da_name == '\0') {
2454                 *status = NFSERR_ACCES;
2455                 return;
2456         }
2457
2458         vp = nfs_fhtovp(da->da_fhandle, exi);
2459         if (vp == NULL) {
2460                 *status = NFSERR_STALE;
2461                 return;
2462         }
2463
2464         if (rdonly(ro, vp)) {
2465                 VN_RELE(vp);
2466                 *status = NFSERR_ROFS;
2467                 return;
2468         }
2469
2470         /*
2471          * fop_rmdir takes a third argument (the current
2472          * directory of the process).  That's because someone
2473          * wants to return EINVAL if one tries to remove ".".
2474          * Of course, NFS servers have no idea what their
2475          * clients' current directories are.  We fake it by
2476          * supplying a vnode known to exist and illegal to
2477          * remove.
2478          */
2479         error = fop_rmdir(vp, da->da_name, rootdir, cr, NULL, 0);
2480
2481         /*
2482          * Force modified data and metadata out to stable storage.
2483          */
2484         (void) fop_fsync(vp, 0, cr, NULL);
2485
2486         VN_RELE(vp);
2487
2488         /*
2489          * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2490          * if the directory is not empty.  A System V NFS server
2491          * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2492          * over the wire.
2493          */
2494         if (error == EEXIST)
2495                 *status = NFSERR_NOTEMPTY;
2496         else
2497                 *status = puterrno(error);
2498
2499 }
2500 void *
2501 rfs_rmdir_getfh(struct nfsdiropargs *da)
2502 {
2503         return (da->da_fhandle);
2504 }
2505
2506 /* ARGSUSED */
2507 void
2508 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2509     struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2510 {
2511         int error;
2512         int iseof;
2513         struct iovec iov;
2514         struct uio uio;
2515         vnode_t *vp;
2516         char *ndata = NULL;
2517         struct sockaddr *ca;
2518         size_t nents;
2519         int ret;
2520
2521         vp = nfs_fhtovp(&rda->rda_fh, exi);
2522         if (vp == NULL) {
2523                 rd->rd_entries = NULL;
2524                 rd->rd_status = NFSERR_STALE;
2525                 return;
2526         }
2527
2528         if (vp->v_type != VDIR) {
2529                 VN_RELE(vp);
2530                 rd->rd_entries = NULL;
2531                 rd->rd_status = NFSERR_NOTDIR;
2532                 return;
2533         }
2534
2535         (void) fop_rwlock(vp, V_WRITELOCK_FALSE, NULL);
2536
2537         error = fop_access(vp, VREAD, 0, cr, NULL);
2538
2539         if (error) {
2540                 rd->rd_entries = NULL;
2541                 goto bad;
2542         }
2543
2544         if (rda->rda_count == 0) {
2545                 rd->rd_entries = NULL;
2546                 rd->rd_size = 0;
2547                 rd->rd_eof = FALSE;
2548                 goto bad;
2549         }
2550
2551         rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2552
2553         /*
2554          * Allocate data for entries.  This will be freed by rfs_rddirfree.
2555          */
2556         rd->rd_bufsize = (uint_t)rda->rda_count;
2557         rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2558
2559         /*
2560          * Set up io vector to read directory data
2561          */
2562         iov.iov_base = (caddr_t)rd->rd_entries;
2563         iov.iov_len = rda->rda_count;
2564         uio.uio_iov = &iov;
2565         uio.uio_iovcnt = 1;
2566         uio.uio_segflg = UIO_SYSSPACE;
2567         uio.uio_extflg = UIO_COPY_CACHED;
2568         uio.uio_loffset = (offset_t)rda->rda_offset;
2569         uio.uio_resid = rda->rda_count;
2570
2571         /*
2572          * read directory
2573          */
2574         error = fop_readdir(vp, &uio, cr, &iseof, NULL, 0);
2575
2576         /*
2577          * Clean up
2578          */
2579         if (!error) {
2580                 /*
2581                  * set size and eof
2582                  */
2583                 if (uio.uio_resid == rda->rda_count) {
2584                         rd->rd_size = 0;
2585                         rd->rd_eof = TRUE;
2586                 } else {
2587                         rd->rd_size = (uint32_t)(rda->rda_count -
2588                             uio.uio_resid);
2589                         rd->rd_eof = iseof ? TRUE : FALSE;
2590                 }
2591         }
2592
2593         ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2594         nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2595         ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2596             rda->rda_count, &ndata);
2597
2598         if (ret != 0) {
2599                 size_t dropbytes;
2600                 /*
2601                  * We had to drop one or more entries in order to fit
2602                  * during the character conversion.  We need to patch
2603                  * up the size and eof info.
2604                  */
2605                 if (rd->rd_eof)
2606                         rd->rd_eof = FALSE;
2607                 dropbytes = nfscmd_dropped_entrysize(
2608                     (struct dirent64 *)rd->rd_entries, nents, ret);
2609                 rd->rd_size -= dropbytes;
2610         }
2611         if (ndata == NULL) {
2612                 ndata = (char *)rd->rd_entries;
2613         } else if (ndata != (char *)rd->rd_entries) {
2614                 kmem_free(rd->rd_entries, rd->rd_bufsize);
2615                 rd->rd_entries = (void *)ndata;
2616                 rd->rd_bufsize = rda->rda_count;
2617         }
2618
2619 bad:
2620         fop_rwunlock(vp, V_WRITELOCK_FALSE, NULL);
2621
2622 #if 0 /* notyet */
2623         /*
2624          * Don't do this.  It causes local disk writes when just
2625          * reading the file and the overhead is deemed larger
2626          * than the benefit.
2627          */
2628         /*
2629          * Force modified metadata out to stable storage.
2630          */
2631         (void) fop_fsync(vp, FNODSYNC, cr, NULL);
2632 #endif
2633
2634         VN_RELE(vp);
2635
2636         rd->rd_status = puterrno(error);
2637
2638 }
2639 void *
2640 rfs_readdir_getfh(struct nfsrddirargs *rda)
2641 {
2642         return (&rda->rda_fh);
2643 }
2644 void
2645 rfs_rddirfree(struct nfsrddirres *rd)
2646 {
2647         if (rd->rd_entries != NULL)
2648                 kmem_free(rd->rd_entries, rd->rd_bufsize);
2649 }
2650
2651 /* ARGSUSED */
2652 void
2653 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2654     struct svc_req *req, cred_t *cr, bool_t ro)
2655 {
2656         int error;
2657         struct statvfs64 sb;
2658         vnode_t *vp;
2659
2660         vp = nfs_fhtovp(fh, exi);
2661         if (vp == NULL) {
2662                 fs->fs_status = NFSERR_STALE;
2663                 return;
2664         }
2665
2666         error = VFS_STATVFS(vp->v_vfsp, &sb);
2667
2668         if (!error) {
2669                 fs->fs_tsize = nfstsize();
2670                 fs->fs_bsize = sb.f_frsize;
2671                 fs->fs_blocks = sb.f_blocks;
2672                 fs->fs_bfree = sb.f_bfree;
2673                 fs->fs_bavail = sb.f_bavail;
2674         }
2675
2676         VN_RELE(vp);
2677
2678         fs->fs_status = puterrno(error);
2679
2680 }
2681 void *
2682 rfs_statfs_getfh(fhandle_t *fh)
2683 {
2684         return (fh);
2685 }
2686
2687 static int
2688 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2689 {
2690         vap->va_mask = 0;
2691
2692         /*
2693          * There was a sign extension bug in some VFS based systems
2694          * which stored the mode as a short.  When it would get
2695          * assigned to a u_long, no sign extension would occur.
2696          * It needed to, but this wasn't noticed because sa_mode
2697          * would then get assigned back to the short, thus ignoring
2698          * the upper 16 bits of sa_mode.
2699          *
2700          * To make this implementation work for both broken
2701          * clients and good clients, we check for both versions
2702          * of the mode.
2703          */
2704         if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2705             sa->sa_mode != (uint32_t)-1) {
2706                 vap->va_mask |= AT_MODE;
2707                 vap->va_mode = sa->sa_mode;
2708         }
2709         if (sa->sa_uid != (uint32_t)-1) {
2710                 vap->va_mask |= AT_UID;
2711                 vap->va_uid = sa->sa_uid;
2712         }
2713         if (sa->sa_gid != (uint32_t)-1) {
2714                 vap->va_mask |= AT_GID;
2715                 vap->va_gid = sa->sa_gid;
2716         }
2717         if (sa->sa_size != (uint32_t)-1) {
2718                 vap->va_mask |= AT_SIZE;
2719                 vap->va_size = sa->sa_size;
2720         }
2721         if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2722             sa->sa_atime.tv_usec != (int32_t)-1) {
2723 #ifndef _LP64
2724                 /* return error if time overflow */
2725                 if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2726                         return (EOVERFLOW);
2727 #endif
2728                 vap->va_mask |= AT_ATIME;
2729                 /*
2730                  * nfs protocol defines times as unsigned so don't extend sign,
2731                  * unless sysadmin set nfs_allow_preepoch_time.
2732                  */
2733                 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2734                 vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2735         }
2736         if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2737             sa->sa_mtime.tv_usec != (int32_t)-1) {
2738 #ifndef _LP64
2739                 /* return error if time overflow */
2740                 if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2741                         return (EOVERFLOW);
2742 #endif
2743                 vap->va_mask |= AT_MTIME;
2744                 /*
2745                  * nfs protocol defines times as unsigned so don't extend sign,
2746                  * unless sysadmin set nfs_allow_preepoch_time.
2747                  */
2748                 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2749                 vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2750         }
2751         return (0);
2752 }
2753
2754 static enum nfsftype vt_to_nf[] = {
2755         0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2756 };
2757
2758 /*
2759  * check the following fields for overflow: nodeid, size, and time.
2760  * There could be a problem when converting 64-bit LP64 fields
2761  * into 32-bit ones.  Return an error if there is an overflow.
2762  */
2763 int
2764 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2765 {
2766         ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2767         na->na_type = vt_to_nf[vap->va_type];
2768
2769         if (vap->va_mode == (unsigned short) -1)
2770                 na->na_mode = (uint32_t)-1;
2771         else
2772                 na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2773
2774         if (vap->va_uid == (unsigned short)(-1))
2775                 na->na_uid = (uint32_t)(-1);
2776         else if (vap->va_uid == UID_NOBODY)
2777                 na->na_uid = (uint32_t)NFS_UID_NOBODY;
2778         else
2779                 na->na_uid = vap->va_uid;
2780
2781         if (vap->va_gid == (unsigned short)(-1))
2782                 na->na_gid = (uint32_t)-1;
2783         else if (vap->va_gid == GID_NOBODY)
2784                 na->na_gid = (uint32_t)NFS_GID_NOBODY;
2785         else
2786                 na->na_gid = vap->va_gid;
2787
2788         /*
2789          * Do we need to check fsid for overflow?  It is 64-bit in the
2790          * vattr, but are bigger than 32 bit values supported?
2791          */
2792         na->na_fsid = vap->va_fsid;
2793
2794         na->na_nodeid = vap->va_nodeid;
2795
2796         /*
2797          * Check to make sure that the nodeid is representable over the
2798          * wire without losing bits.
2799          */
2800         if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2801                 return (EFBIG);
2802         na->na_nlink = vap->va_nlink;
2803
2804         /*
2805          * Check for big files here, instead of at the caller.  See
2806          * comments in cstat for large special file explanation.
2807          */
2808         if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2809                 if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2810                         return (EFBIG);
2811                 if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2812                         /* UNKNOWN_SIZE | OVERFLOW */
2813                         na->na_size = MAXOFF32_T;
2814                 } else
2815                         na->na_size = vap->va_size;
2816         } else
2817                 na->na_size = vap->va_size;
2818
2819         /*
2820          * If the vnode times overflow the 32-bit times that NFS2
2821          * uses on the wire then return an error.
2822          */
2823         if (!NFS_VAP_TIME_OK(vap)) {
2824                 return (EOVERFLOW);
2825         }
2826         na->na_atime.tv_sec = vap->va_atime.tv_sec;
2827         na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2828
2829         na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2830         na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2831
2832         na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2833         na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2834
2835         /*
2836          * If the dev_t will fit into 16 bits then compress
2837          * it, otherwise leave it alone. See comments in
2838          * nfs_client.c.
2839          */
2840         if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2841             getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2842                 na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2843         else
2844                 (void) cmpldev(&na->na_rdev, vap->va_rdev);
2845
2846         na->na_blocks = vap->va_nblocks;
2847         na->na_blocksize = vap->va_blksize;
2848
2849         /*
2850          * This bit of ugliness is a *TEMPORARY* hack to preserve the
2851          * over-the-wire protocols for named-pipe vnodes.  It remaps the
2852          * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2853          *
2854          * BUYER BEWARE:
2855          *  If you are porting the NFS to a non-Sun server, you probably
2856          *  don't want to include the following block of code.  The
2857          *  over-the-wire special file types will be changing with the
2858          *  NFS Protocol Revision.
2859          */
2860         if (vap->va_type == VFIFO)
2861                 NA_SETFIFO(na);
2862         return (0);
2863 }
2864
2865 /*
2866  * acl v2 support: returns approximate permission.
2867  *      default: returns minimal permission (more restrictive)
2868  *      aclok: returns maximal permission (less restrictive)
2869  *      This routine changes the permissions that are alaredy in *va.
2870  *      If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2871  *      CLASS_OBJ is always the same as GROUP_OBJ entry.
2872  */
2873 static void
2874 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2875 {
2876         vsecattr_t      vsa;
2877         int             aclcnt;
2878         aclent_t        *aclentp;
2879         mode_t          mask_perm;
2880         mode_t          grp_perm;
2881         mode_t          other_perm;
2882         mode_t          other_orig;
2883         int             error;
2884
2885         /* dont care default acl */
2886         vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2887         error = fop_getsecattr(vp, &vsa, 0, cr, NULL);
2888
2889         if (!error) {
2890                 aclcnt = vsa.vsa_aclcnt;
2891                 if (aclcnt > MIN_ACL_ENTRIES) {
2892                         /* non-trivial ACL */
2893                         aclentp = vsa.vsa_aclentp;
2894                         if (exi->exi_export.ex_flags & EX_ACLOK) {
2895                                 /* maximal permissions */
2896                                 grp_perm = 0;
2897                                 other_perm = 0;
2898                                 for (; aclcnt > 0; aclcnt--, aclentp++) {
2899                                         switch (aclentp->a_type) {
2900                                         case USER_OBJ:
2901                                                 break;
2902                                         case USER:
2903                                                 grp_perm |=
2904                                                     aclentp->a_perm << 3;
2905                                                 other_perm |= aclentp->a_perm;
2906                                                 break;
2907                                         case GROUP_OBJ:
2908                                                 grp_perm |=
2909                                                     aclentp->a_perm << 3;
2910                                                 break;
2911                                         case GROUP:
2912                                                 other_perm |= aclentp->a_perm;
2913                                                 break;
2914                                         case OTHER_OBJ:
2915                                                 other_orig = aclentp->a_perm;
2916                                                 break;
2917                                         case CLASS_OBJ:
2918                                                 mask_perm = aclentp->a_perm;
2919                                                 break;
2920                                         default:
2921                                                 break;
2922                                         }
2923                                 }
2924                                 grp_perm &= mask_perm << 3;
2925                                 other_perm &= mask_perm;
2926                                 other_perm |= other_orig;
2927
2928                         } else {
2929                                 /* minimal permissions */
2930                                 grp_perm = 070;
2931                                 other_perm = 07;
2932                                 for (; aclcnt > 0; aclcnt--, aclentp++) {
2933                                         switch (aclentp->a_type) {
2934                                         case USER_OBJ:
2935                                                 break;
2936                                         case USER:
2937                                         case CLASS_OBJ:
2938                                                 grp_perm &=
2939                                                     aclentp->a_perm << 3;
2940                                                 other_perm &=
2941                                                     aclentp->a_perm;
2942                                                 break;
2943                                         case GROUP_OBJ:
2944                                                 grp_perm &=
2945                                                     aclentp->a_perm << 3;
2946                                                 break;
2947                                         case GROUP:
2948                                                 other_perm &=
2949                                                     aclentp->a_perm;
2950                                                 break;
2951                                         case OTHER_OBJ:
2952                                                 other_perm &=
2953                                                     aclentp->a_perm;
2954                                                 break;
2955                                         default:
2956                                                 break;
2957                                         }
2958                                 }
2959                         }
2960                         /* copy to va */
2961                         va->va_mode &= ~077;
2962                         va->va_mode |= grp_perm | other_perm;
2963                 }
2964                 if (vsa.vsa_aclcnt)
2965                         kmem_free(vsa.vsa_aclentp,
2966                             vsa.vsa_aclcnt * sizeof (aclent_t));
2967         }
2968 }
2969
2970 void
2971 rfs_srvrinit(void)
2972 {
2973         mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
2974         nfs2_srv_caller_id = fs_new_caller_id();
2975 }
2976
2977 void
2978 rfs_srvrfini(void)
2979 {
2980         mutex_destroy(&rfs_async_write_lock);
2981 }
2982
2983 static int
2984 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
2985 {
2986         struct clist    *wcl;
2987         int             wlist_len;
2988         uint32_t        count = rr->rr_count;
2989
2990         wcl = ra->ra_wlist;
2991
2992         if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
2993                 return (FALSE);
2994         }
2995
2996         wcl = ra->ra_wlist;
2997         rr->rr_ok.rrok_wlist_len = wlist_len;
2998         rr->rr_ok.rrok_wlist = wcl;
2999
3000         return (TRUE);
3001 }