2986 nfs: exi refcounter leak at rfs3_lookup
[unleashed.git] / usr / src / uts / common / fs / nfs / nfs_srv.c
blobf0cd9633aa157f9cd61889af4ac761d8e658b88f
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
27 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
28 * All rights reserved.
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/systm.h>
34 #include <sys/cred.h>
35 #include <sys/buf.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/uio.h>
39 #include <sys/stat.h>
40 #include <sys/errno.h>
41 #include <sys/sysmacros.h>
42 #include <sys/statvfs.h>
43 #include <sys/kmem.h>
44 #include <sys/kstat.h>
45 #include <sys/dirent.h>
46 #include <sys/cmn_err.h>
47 #include <sys/debug.h>
48 #include <sys/vtrace.h>
49 #include <sys/mode.h>
50 #include <sys/acl.h>
51 #include <sys/nbmlock.h>
52 #include <sys/policy.h>
53 #include <sys/sdt.h>
55 #include <rpc/types.h>
56 #include <rpc/auth.h>
57 #include <rpc/svc.h>
59 #include <nfs/nfs.h>
60 #include <nfs/export.h>
61 #include <nfs/nfs_cmd.h>
63 #include <vm/hat.h>
64 #include <vm/as.h>
65 #include <vm/seg.h>
66 #include <vm/seg_map.h>
67 #include <vm/seg_kmem.h>
69 #include <sys/strsubr.h>
72 * These are the interface routines for the server side of the
73 * Network File System. See the NFS version 2 protocol specification
74 * for a description of this interface.
77 static int sattr_to_vattr(struct nfssattr *, struct vattr *);
78 static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
79 cred_t *);
82 * Some "over the wire" UNIX file types. These are encoded
83 * into the mode. This needs to be fixed in the next rev.
85 #define IFMT 0170000 /* type of file */
86 #define IFCHR 0020000 /* character special */
87 #define IFBLK 0060000 /* block special */
88 #define IFSOCK 0140000 /* socket */
90 u_longlong_t nfs2_srv_caller_id;
93 * Get file attributes.
94 * Returns the current attributes of the file with the given fhandle.
96 /* ARGSUSED */
97 void
98 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
99 struct svc_req *req, cred_t *cr)
101 int error;
102 vnode_t *vp;
103 struct vattr va;
105 vp = nfs_fhtovp(fhp, exi);
106 if (vp == NULL) {
107 ns->ns_status = NFSERR_STALE;
108 return;
112 * Do the getattr.
114 va.va_mask = AT_ALL; /* we want all the attributes */
116 error = rfs4_delegated_getattr(vp, &va, 0, cr);
118 /* check for overflows */
119 if (!error) {
120 /* Lie about the object type for a referral */
121 if (vn_is_nfs_reparse(vp, cr))
122 va.va_type = VLNK;
124 acl_perm(vp, exi, &va, cr);
125 error = vattr_to_nattr(&va, &ns->ns_attr);
128 VN_RELE(vp);
130 ns->ns_status = puterrno(error);
132 void *
133 rfs_getattr_getfh(fhandle_t *fhp)
135 return (fhp);
139 * Set file attributes.
140 * Sets the attributes of the file with the given fhandle. Returns
141 * the new attributes.
143 void
144 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
145 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
147 int error;
148 int flag;
149 int in_crit = 0;
150 vnode_t *vp;
151 struct vattr va;
152 struct vattr bva;
153 struct flock64 bf;
154 caller_context_t ct;
157 vp = nfs_fhtovp(&args->saa_fh, exi);
158 if (vp == NULL) {
159 ns->ns_status = NFSERR_STALE;
160 return;
163 if (rdonly(exi, req) || vn_is_readonly(vp)) {
164 VN_RELE(vp);
165 ns->ns_status = NFSERR_ROFS;
166 return;
169 error = sattr_to_vattr(&args->saa_sa, &va);
170 if (error) {
171 VN_RELE(vp);
172 ns->ns_status = puterrno(error);
173 return;
177 * If the client is requesting a change to the mtime,
178 * but the nanosecond field is set to 1 billion, then
179 * this is a flag to the server that it should set the
180 * atime and mtime fields to the server's current time.
181 * The 1 billion number actually came from the client
182 * as 1 million, but the units in the over the wire
183 * request are microseconds instead of nanoseconds.
185 * This is an overload of the protocol and should be
186 * documented in the NFS Version 2 protocol specification.
188 if (va.va_mask & AT_MTIME) {
189 if (va.va_mtime.tv_nsec == 1000000000) {
190 gethrestime(&va.va_mtime);
191 va.va_atime = va.va_mtime;
192 va.va_mask |= AT_ATIME;
193 flag = 0;
194 } else
195 flag = ATTR_UTIME;
196 } else
197 flag = 0;
200 * If the filesystem is exported with nosuid, then mask off
201 * the setuid and setgid bits.
203 if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
204 (exi->exi_export.ex_flags & EX_NOSUID))
205 va.va_mode &= ~(VSUID | VSGID);
207 ct.cc_sysid = 0;
208 ct.cc_pid = 0;
209 ct.cc_caller_id = nfs2_srv_caller_id;
210 ct.cc_flags = CC_DONTBLOCK;
213 * We need to specially handle size changes because it is
214 * possible for the client to create a file with modes
215 * which indicate read-only, but with the file opened for
216 * writing. If the client then tries to set the size of
217 * the file, then the normal access checking done in
218 * VOP_SETATTR would prevent the client from doing so,
219 * although it should be legal for it to do so. To get
220 * around this, we do the access checking for ourselves
221 * and then use VOP_SPACE which doesn't do the access
222 * checking which VOP_SETATTR does. VOP_SPACE can only
223 * operate on VREG files, let VOP_SETATTR handle the other
224 * extremely rare cases.
225 * Also the client should not be allowed to change the
226 * size of the file if there is a conflicting non-blocking
227 * mandatory lock in the region of change.
229 if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
230 if (nbl_need_check(vp)) {
231 nbl_start_crit(vp, RW_READER);
232 in_crit = 1;
235 bva.va_mask = AT_UID | AT_SIZE;
237 error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
239 if (error) {
240 if (in_crit)
241 nbl_end_crit(vp);
242 VN_RELE(vp);
243 ns->ns_status = puterrno(error);
244 return;
247 if (in_crit) {
248 u_offset_t offset;
249 ssize_t length;
251 if (va.va_size < bva.va_size) {
252 offset = va.va_size;
253 length = bva.va_size - va.va_size;
254 } else {
255 offset = bva.va_size;
256 length = va.va_size - bva.va_size;
258 if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
259 NULL)) {
260 error = EACCES;
264 if (crgetuid(cr) == bva.va_uid && !error &&
265 va.va_size != bva.va_size) {
266 va.va_mask &= ~AT_SIZE;
267 bf.l_type = F_WRLCK;
268 bf.l_whence = 0;
269 bf.l_start = (off64_t)va.va_size;
270 bf.l_len = 0;
271 bf.l_sysid = 0;
272 bf.l_pid = 0;
274 error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
275 (offset_t)va.va_size, cr, &ct);
277 if (in_crit)
278 nbl_end_crit(vp);
279 } else
280 error = 0;
283 * Do the setattr.
285 if (!error && va.va_mask) {
286 error = VOP_SETATTR(vp, &va, flag, cr, &ct);
290 * check if the monitor on either vop_space or vop_setattr detected
291 * a delegation conflict and if so, mark the thread flag as
292 * wouldblock so that the response is dropped and the client will
293 * try again.
295 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
296 VN_RELE(vp);
297 curthread->t_flag |= T_WOULDBLOCK;
298 return;
301 if (!error) {
302 va.va_mask = AT_ALL; /* get everything */
304 error = rfs4_delegated_getattr(vp, &va, 0, cr);
306 /* check for overflows */
307 if (!error) {
308 acl_perm(vp, exi, &va, cr);
309 error = vattr_to_nattr(&va, &ns->ns_attr);
313 ct.cc_flags = 0;
316 * Force modified metadata out to stable storage.
318 (void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
320 VN_RELE(vp);
322 ns->ns_status = puterrno(error);
324 void *
325 rfs_setattr_getfh(struct nfssaargs *args)
327 return (&args->saa_fh);
331 * Directory lookup.
332 * Returns an fhandle and file attributes for file name in a directory.
334 /* ARGSUSED */
335 void
336 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
337 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
339 int error;
340 vnode_t *dvp;
341 vnode_t *vp;
342 struct vattr va;
343 fhandle_t *fhp = da->da_fhandle;
344 struct sec_ol sec = {0, 0};
345 bool_t publicfh_flag = FALSE, auth_weak = FALSE;
346 char *name;
347 struct sockaddr *ca;
350 * Trusted Extension doesn't support NFSv2. MOUNT
351 * will reject v2 clients. Need to prevent v2 client
352 * access via WebNFS here.
354 if (is_system_labeled() && req->rq_vers == 2) {
355 dr->dr_status = NFSERR_ACCES;
356 return;
360 * Disallow NULL paths
362 if (da->da_name == NULL || *da->da_name == '\0') {
363 dr->dr_status = NFSERR_ACCES;
364 return;
368 * Allow lookups from the root - the default
369 * location of the public filehandle.
371 if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
372 dvp = rootdir;
373 VN_HOLD(dvp);
374 } else {
375 dvp = nfs_fhtovp(fhp, exi);
376 if (dvp == NULL) {
377 dr->dr_status = NFSERR_STALE;
378 return;
383 * Not allow lookup beyond root.
384 * If the filehandle matches a filehandle of the exi,
385 * then the ".." refers beyond the root of an exported filesystem.
387 if (strcmp(da->da_name, "..") == 0 &&
388 EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
389 VN_RELE(dvp);
390 dr->dr_status = NFSERR_NOENT;
391 return;
394 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
395 name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
396 MAXPATHLEN);
398 if (name == NULL) {
399 dr->dr_status = NFSERR_ACCES;
400 return;
403 exi_hold(exi);
406 * If the public filehandle is used then allow
407 * a multi-component lookup, i.e. evaluate
408 * a pathname and follow symbolic links if
409 * necessary.
411 * This may result in a vnode in another filesystem
412 * which is OK as long as the filesystem is exported.
414 if (PUBLIC_FH2(fhp)) {
415 struct exportinfo *new;
417 publicfh_flag = TRUE;
418 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &new,
419 &sec);
421 if (error == 0) {
422 exi_rele(exi);
423 exi = new;
425 } else {
427 * Do a normal single component lookup.
429 error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
430 NULL, NULL, NULL);
433 if (name != da->da_name)
434 kmem_free(name, MAXPATHLEN);
437 if (!error) {
438 va.va_mask = AT_ALL; /* we want everything */
440 error = rfs4_delegated_getattr(vp, &va, 0, cr);
442 /* check for overflows */
443 if (!error) {
444 acl_perm(vp, exi, &va, cr);
445 error = vattr_to_nattr(&va, &dr->dr_attr);
446 if (!error) {
447 if (sec.sec_flags & SEC_QUERY)
448 error = makefh_ol(&dr->dr_fhandle, exi,
449 sec.sec_index);
450 else {
451 error = makefh(&dr->dr_fhandle, vp,
452 exi);
453 if (!error && publicfh_flag &&
454 !chk_clnt_sec(exi, req))
455 auth_weak = TRUE;
459 VN_RELE(vp);
462 VN_RELE(dvp);
465 * The passed argument exportinfo is released by the
466 * caller, comon_dispatch
468 exi_rele(exi);
471 * If it's public fh, no 0x81, and client's flavor is
472 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
473 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
475 if (auth_weak)
476 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
477 else
478 dr->dr_status = puterrno(error);
480 void *
481 rfs_lookup_getfh(struct nfsdiropargs *da)
483 return (da->da_fhandle);
487 * Read symbolic link.
488 * Returns the string in the symbolic link at the given fhandle.
490 /* ARGSUSED */
491 void
492 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
493 struct svc_req *req, cred_t *cr)
495 int error;
496 struct iovec iov;
497 struct uio uio;
498 vnode_t *vp;
499 struct vattr va;
500 struct sockaddr *ca;
501 char *name = NULL;
502 int is_referral = 0;
504 vp = nfs_fhtovp(fhp, exi);
505 if (vp == NULL) {
506 rl->rl_data = NULL;
507 rl->rl_status = NFSERR_STALE;
508 return;
511 va.va_mask = AT_MODE;
513 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
515 if (error) {
516 VN_RELE(vp);
517 rl->rl_data = NULL;
518 rl->rl_status = puterrno(error);
519 return;
522 if (MANDLOCK(vp, va.va_mode)) {
523 VN_RELE(vp);
524 rl->rl_data = NULL;
525 rl->rl_status = NFSERR_ACCES;
526 return;
529 /* We lied about the object type for a referral */
530 if (vn_is_nfs_reparse(vp, cr))
531 is_referral = 1;
534 * XNFS and RFC1094 require us to return ENXIO if argument
535 * is not a link. BUGID 1138002.
537 if (vp->v_type != VLNK && !is_referral) {
538 VN_RELE(vp);
539 rl->rl_data = NULL;
540 rl->rl_status = NFSERR_NXIO;
541 return;
545 * Allocate data for pathname. This will be freed by rfs_rlfree.
547 rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
549 if (is_referral) {
550 char *s;
551 size_t strsz;
553 /* Get an artificial symlink based on a referral */
554 s = build_symlink(vp, cr, &strsz);
555 global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
556 DTRACE_PROBE2(nfs2serv__func__referral__reflink,
557 vnode_t *, vp, char *, s);
558 if (s == NULL)
559 error = EINVAL;
560 else {
561 error = 0;
562 (void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
563 rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
564 kmem_free(s, strsz);
567 } else {
570 * Set up io vector to read sym link data
572 iov.iov_base = rl->rl_data;
573 iov.iov_len = NFS_MAXPATHLEN;
574 uio.uio_iov = &iov;
575 uio.uio_iovcnt = 1;
576 uio.uio_segflg = UIO_SYSSPACE;
577 uio.uio_extflg = UIO_COPY_CACHED;
578 uio.uio_loffset = (offset_t)0;
579 uio.uio_resid = NFS_MAXPATHLEN;
582 * Do the readlink.
584 error = VOP_READLINK(vp, &uio, cr, NULL);
586 rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
588 if (!error)
589 rl->rl_data[rl->rl_count] = '\0';
594 VN_RELE(vp);
596 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
597 name = nfscmd_convname(ca, exi, rl->rl_data,
598 NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
600 if (name != NULL && name != rl->rl_data) {
601 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
602 rl->rl_data = name;
606 * XNFS and RFC1094 require us to return ENXIO if argument
607 * is not a link. UFS returns EINVAL if this is the case,
608 * so we do the mapping here. BUGID 1138002.
610 if (error == EINVAL)
611 rl->rl_status = NFSERR_NXIO;
612 else
613 rl->rl_status = puterrno(error);
616 void *
617 rfs_readlink_getfh(fhandle_t *fhp)
619 return (fhp);
622 * Free data allocated by rfs_readlink
624 void
625 rfs_rlfree(struct nfsrdlnres *rl)
627 if (rl->rl_data != NULL)
628 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
631 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
634 * Read data.
635 * Returns some data read from the file at the given fhandle.
637 /* ARGSUSED */
638 void
639 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
640 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
642 vnode_t *vp;
643 int error;
644 struct vattr va;
645 struct iovec iov;
646 struct uio uio;
647 mblk_t *mp;
648 int alloc_err = 0;
649 int in_crit = 0;
650 caller_context_t ct;
652 vp = nfs_fhtovp(&ra->ra_fhandle, exi);
653 if (vp == NULL) {
654 rr->rr_data = NULL;
655 rr->rr_status = NFSERR_STALE;
656 return;
659 if (vp->v_type != VREG) {
660 VN_RELE(vp);
661 rr->rr_data = NULL;
662 rr->rr_status = NFSERR_ISDIR;
663 return;
666 ct.cc_sysid = 0;
667 ct.cc_pid = 0;
668 ct.cc_caller_id = nfs2_srv_caller_id;
669 ct.cc_flags = CC_DONTBLOCK;
672 * Enter the critical region before calling VOP_RWLOCK
673 * to avoid a deadlock with write requests.
675 if (nbl_need_check(vp)) {
676 nbl_start_crit(vp, RW_READER);
677 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
678 0, NULL)) {
679 nbl_end_crit(vp);
680 VN_RELE(vp);
681 rr->rr_data = NULL;
682 rr->rr_status = NFSERR_ACCES;
683 return;
685 in_crit = 1;
688 error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
690 /* check if a monitor detected a delegation conflict */
691 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
692 VN_RELE(vp);
693 /* mark as wouldblock so response is dropped */
694 curthread->t_flag |= T_WOULDBLOCK;
696 rr->rr_data = NULL;
697 return;
700 va.va_mask = AT_ALL;
702 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
704 if (error) {
705 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
706 if (in_crit)
707 nbl_end_crit(vp);
709 VN_RELE(vp);
710 rr->rr_data = NULL;
711 rr->rr_status = puterrno(error);
713 return;
717 * This is a kludge to allow reading of files created
718 * with no read permission. The owner of the file
719 * is always allowed to read it.
721 if (crgetuid(cr) != va.va_uid) {
722 error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
724 if (error) {
726 * Exec is the same as read over the net because
727 * of demand loading.
729 error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
731 if (error) {
732 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
733 if (in_crit)
734 nbl_end_crit(vp);
735 VN_RELE(vp);
736 rr->rr_data = NULL;
737 rr->rr_status = puterrno(error);
739 return;
743 if (MANDLOCK(vp, va.va_mode)) {
744 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
745 if (in_crit)
746 nbl_end_crit(vp);
748 VN_RELE(vp);
749 rr->rr_data = NULL;
750 rr->rr_status = NFSERR_ACCES;
752 return;
755 rr->rr_ok.rrok_wlist_len = 0;
756 rr->rr_ok.rrok_wlist = NULL;
758 if ((u_offset_t)ra->ra_offset >= va.va_size) {
759 rr->rr_count = 0;
760 rr->rr_data = NULL;
762 * In this case, status is NFS_OK, but there is no data
763 * to encode. So set rr_mp to NULL.
765 rr->rr_mp = NULL;
766 rr->rr_ok.rrok_wlist = ra->ra_wlist;
767 if (rr->rr_ok.rrok_wlist)
768 clist_zero_len(rr->rr_ok.rrok_wlist);
769 goto done;
772 if (ra->ra_wlist) {
773 mp = NULL;
774 rr->rr_mp = NULL;
775 (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
776 if (ra->ra_count > iov.iov_len) {
777 rr->rr_data = NULL;
778 rr->rr_status = NFSERR_INVAL;
779 goto done;
781 } else {
783 * mp will contain the data to be sent out in the read reply.
784 * This will be freed after the reply has been sent out (by the
785 * driver).
786 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
787 * that the call to xdrmblk_putmblk() never fails.
789 mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
790 &alloc_err);
791 ASSERT(mp != NULL);
792 ASSERT(alloc_err == 0);
794 rr->rr_mp = mp;
797 * Set up io vector
799 iov.iov_base = (caddr_t)mp->b_datap->db_base;
800 iov.iov_len = ra->ra_count;
803 uio.uio_iov = &iov;
804 uio.uio_iovcnt = 1;
805 uio.uio_segflg = UIO_SYSSPACE;
806 uio.uio_extflg = UIO_COPY_CACHED;
807 uio.uio_loffset = (offset_t)ra->ra_offset;
808 uio.uio_resid = ra->ra_count;
810 error = VOP_READ(vp, &uio, 0, cr, &ct);
812 if (error) {
813 if (mp)
814 freeb(mp);
817 * check if a monitor detected a delegation conflict and
818 * mark as wouldblock so response is dropped
820 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
821 curthread->t_flag |= T_WOULDBLOCK;
822 else
823 rr->rr_status = puterrno(error);
825 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
826 if (in_crit)
827 nbl_end_crit(vp);
829 VN_RELE(vp);
830 rr->rr_data = NULL;
832 return;
836 * Get attributes again so we can send the latest access
837 * time to the client side for his cache.
839 va.va_mask = AT_ALL;
841 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
843 if (error) {
844 if (mp)
845 freeb(mp);
847 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
848 if (in_crit)
849 nbl_end_crit(vp);
851 VN_RELE(vp);
852 rr->rr_data = NULL;
853 rr->rr_status = puterrno(error);
855 return;
858 rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
860 if (mp) {
861 rr->rr_data = (char *)mp->b_datap->db_base;
862 } else {
863 if (ra->ra_wlist) {
864 rr->rr_data = (caddr_t)iov.iov_base;
865 if (!rdma_setup_read_data2(ra, rr)) {
866 rr->rr_data = NULL;
867 rr->rr_status = puterrno(NFSERR_INVAL);
871 done:
872 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
873 if (in_crit)
874 nbl_end_crit(vp);
876 acl_perm(vp, exi, &va, cr);
878 /* check for overflows */
879 error = vattr_to_nattr(&va, &rr->rr_attr);
881 VN_RELE(vp);
883 rr->rr_status = puterrno(error);
887 * Free data allocated by rfs_read
889 void
890 rfs_rdfree(struct nfsrdresult *rr)
892 mblk_t *mp;
894 if (rr->rr_status == NFS_OK) {
895 mp = rr->rr_mp;
896 if (mp != NULL)
897 freeb(mp);
901 void *
902 rfs_read_getfh(struct nfsreadargs *ra)
904 return (&ra->ra_fhandle);
907 #define MAX_IOVECS 12
909 #ifdef DEBUG
910 static int rfs_write_sync_hits = 0;
911 static int rfs_write_sync_misses = 0;
912 #endif
915 * Write data to file.
916 * Returns attributes of a file after writing some data to it.
918 * Any changes made here, especially in error handling might have
919 * to also be done in rfs_write (which clusters write requests).
921 void
922 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
923 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
925 int error;
926 vnode_t *vp;
927 rlim64_t rlimit;
928 struct vattr va;
929 struct uio uio;
930 struct iovec iov[MAX_IOVECS];
931 mblk_t *m;
932 struct iovec *iovp;
933 int iovcnt;
934 cred_t *savecred;
935 int in_crit = 0;
936 caller_context_t ct;
938 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
939 if (vp == NULL) {
940 ns->ns_status = NFSERR_STALE;
941 return;
944 if (rdonly(exi, req)) {
945 VN_RELE(vp);
946 ns->ns_status = NFSERR_ROFS;
947 return;
950 if (vp->v_type != VREG) {
951 VN_RELE(vp);
952 ns->ns_status = NFSERR_ISDIR;
953 return;
956 ct.cc_sysid = 0;
957 ct.cc_pid = 0;
958 ct.cc_caller_id = nfs2_srv_caller_id;
959 ct.cc_flags = CC_DONTBLOCK;
961 va.va_mask = AT_UID|AT_MODE;
963 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
965 if (error) {
966 VN_RELE(vp);
967 ns->ns_status = puterrno(error);
969 return;
972 if (crgetuid(cr) != va.va_uid) {
974 * This is a kludge to allow writes of files created
975 * with read only permission. The owner of the file
976 * is always allowed to write it.
978 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
980 if (error) {
981 VN_RELE(vp);
982 ns->ns_status = puterrno(error);
983 return;
988 * Can't access a mandatory lock file. This might cause
989 * the NFS service thread to block forever waiting for a
990 * lock to be released that will never be released.
992 if (MANDLOCK(vp, va.va_mode)) {
993 VN_RELE(vp);
994 ns->ns_status = NFSERR_ACCES;
995 return;
999 * We have to enter the critical region before calling VOP_RWLOCK
1000 * to avoid a deadlock with ufs.
1002 if (nbl_need_check(vp)) {
1003 nbl_start_crit(vp, RW_READER);
1004 in_crit = 1;
1005 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1006 wa->wa_count, 0, NULL)) {
1007 error = EACCES;
1008 goto out;
1012 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1014 /* check if a monitor detected a delegation conflict */
1015 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1016 VN_RELE(vp);
1017 /* mark as wouldblock so response is dropped */
1018 curthread->t_flag |= T_WOULDBLOCK;
1019 return;
1022 if (wa->wa_data || wa->wa_rlist) {
1023 /* Do the RDMA thing if necessary */
1024 if (wa->wa_rlist) {
1025 iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1026 iov[0].iov_len = wa->wa_count;
1027 } else {
1028 iov[0].iov_base = wa->wa_data;
1029 iov[0].iov_len = wa->wa_count;
1031 uio.uio_iov = iov;
1032 uio.uio_iovcnt = 1;
1033 uio.uio_segflg = UIO_SYSSPACE;
1034 uio.uio_extflg = UIO_COPY_DEFAULT;
1035 uio.uio_loffset = (offset_t)wa->wa_offset;
1036 uio.uio_resid = wa->wa_count;
1038 * The limit is checked on the client. We
1039 * should allow any size writes here.
1041 uio.uio_llimit = curproc->p_fsz_ctl;
1042 rlimit = uio.uio_llimit - wa->wa_offset;
1043 if (rlimit < (rlim64_t)uio.uio_resid)
1044 uio.uio_resid = (uint_t)rlimit;
1047 * for now we assume no append mode
1050 * We're changing creds because VM may fault and we need
1051 * the cred of the current thread to be used if quota
1052 * checking is enabled.
1054 savecred = curthread->t_cred;
1055 curthread->t_cred = cr;
1056 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1057 curthread->t_cred = savecred;
1058 } else {
1059 iovcnt = 0;
1060 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1061 iovcnt++;
1062 if (iovcnt <= MAX_IOVECS) {
1063 #ifdef DEBUG
1064 rfs_write_sync_hits++;
1065 #endif
1066 iovp = iov;
1067 } else {
1068 #ifdef DEBUG
1069 rfs_write_sync_misses++;
1070 #endif
1071 iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1073 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1074 uio.uio_iov = iovp;
1075 uio.uio_iovcnt = iovcnt;
1076 uio.uio_segflg = UIO_SYSSPACE;
1077 uio.uio_extflg = UIO_COPY_DEFAULT;
1078 uio.uio_loffset = (offset_t)wa->wa_offset;
1079 uio.uio_resid = wa->wa_count;
1081 * The limit is checked on the client. We
1082 * should allow any size writes here.
1084 uio.uio_llimit = curproc->p_fsz_ctl;
1085 rlimit = uio.uio_llimit - wa->wa_offset;
1086 if (rlimit < (rlim64_t)uio.uio_resid)
1087 uio.uio_resid = (uint_t)rlimit;
1090 * For now we assume no append mode.
1093 * We're changing creds because VM may fault and we need
1094 * the cred of the current thread to be used if quota
1095 * checking is enabled.
1097 savecred = curthread->t_cred;
1098 curthread->t_cred = cr;
1099 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1100 curthread->t_cred = savecred;
1102 if (iovp != iov)
1103 kmem_free(iovp, sizeof (*iovp) * iovcnt);
1106 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1108 if (!error) {
1110 * Get attributes again so we send the latest mod
1111 * time to the client side for his cache.
1113 va.va_mask = AT_ALL; /* now we want everything */
1115 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1117 /* check for overflows */
1118 if (!error) {
1119 acl_perm(vp, exi, &va, cr);
1120 error = vattr_to_nattr(&va, &ns->ns_attr);
1124 out:
1125 if (in_crit)
1126 nbl_end_crit(vp);
1127 VN_RELE(vp);
1129 /* check if a monitor detected a delegation conflict */
1130 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1131 /* mark as wouldblock so response is dropped */
1132 curthread->t_flag |= T_WOULDBLOCK;
1133 else
1134 ns->ns_status = puterrno(error);
/*
 * One queued WRITE request in a write cluster.  Requests to the same
 * file handle are linked together (via 'list') and serviced by a single
 * thread; the others sleep on the cluster's condition variable.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* the client's WRITE arguments */
	struct nfsattrstat *ns;		/* where this request's reply goes */
	struct svc_req *req;		/* RPC request (per-request rdonly check) */
	cred_t *cr;			/* credentials of this request */
	kthread_t *thread;		/* requesting thread, for T_WOULDBLOCK */
	struct rfs_async_write *list;	/* next request in this cluster */
};

/*
 * A cluster of WRITE requests to one file, keyed by file handle.
 * Clusters are kept on a global singly-linked list headed by
 * rfs_async_write_head and protected by rfs_async_write_lock.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle shared by the cluster */
	kcondvar_t cv;			/* waiters signalled when cluster done */
	struct rfs_async_write *list;	/* requests, ordered by wa_offset */
	struct rfs_async_write_list *next;	/* next cluster */
};

static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

#define	MAXCLIOVECS	42
/* sentinel meaning "reply not yet filled in" (0 would read as NFS_OK) */
#define	RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1167 * Write data to file.
1168 * Returns attributes of a file after writing some data to it.
1170 void
1171 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1172 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1174 int error;
1175 vnode_t *vp;
1176 rlim64_t rlimit;
1177 struct vattr va;
1178 struct uio uio;
1179 struct rfs_async_write_list *lp;
1180 struct rfs_async_write_list *nlp;
1181 struct rfs_async_write *rp;
1182 struct rfs_async_write *nrp;
1183 struct rfs_async_write *trp;
1184 struct rfs_async_write *lrp;
1185 int data_written;
1186 int iovcnt;
1187 mblk_t *m;
1188 struct iovec *iovp;
1189 struct iovec *niovp;
1190 struct iovec iov[MAXCLIOVECS];
1191 int count;
1192 int rcount;
1193 uint_t off;
1194 uint_t len;
1195 struct rfs_async_write nrpsp;
1196 struct rfs_async_write_list nlpsp;
1197 ushort_t t_flag;
1198 cred_t *savecred;
1199 int in_crit = 0;
1200 caller_context_t ct;
1202 if (!rfs_write_async) {
1203 rfs_write_sync(wa, ns, exi, req, cr);
1204 return;
1208 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1209 * is considered an OK.
1211 ns->ns_status = RFSWRITE_INITVAL;
1213 nrp = &nrpsp;
1214 nrp->wa = wa;
1215 nrp->ns = ns;
1216 nrp->req = req;
1217 nrp->cr = cr;
1218 nrp->thread = curthread;
1220 ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1223 * Look to see if there is already a cluster started
1224 * for this file.
1226 mutex_enter(&rfs_async_write_lock);
1227 for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1228 if (bcmp(&wa->wa_fhandle, lp->fhp,
1229 sizeof (fhandle_t)) == 0)
1230 break;
1234 * If lp is non-NULL, then there is already a cluster
1235 * started. We need to place ourselves in the cluster
1236 * list in the right place as determined by starting
1237 * offset. Conflicts with non-blocking mandatory locked
1238 * regions will be checked when the cluster is processed.
1240 if (lp != NULL) {
1241 rp = lp->list;
1242 trp = NULL;
1243 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1244 trp = rp;
1245 rp = rp->list;
1247 nrp->list = rp;
1248 if (trp == NULL)
1249 lp->list = nrp;
1250 else
1251 trp->list = nrp;
1252 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1253 cv_wait(&lp->cv, &rfs_async_write_lock);
1254 mutex_exit(&rfs_async_write_lock);
1256 return;
1260 * No cluster started yet, start one and add ourselves
1261 * to the list of clusters.
1263 nrp->list = NULL;
1265 nlp = &nlpsp;
1266 nlp->fhp = &wa->wa_fhandle;
1267 cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1268 nlp->list = nrp;
1269 nlp->next = NULL;
1271 if (rfs_async_write_head == NULL) {
1272 rfs_async_write_head = nlp;
1273 } else {
1274 lp = rfs_async_write_head;
1275 while (lp->next != NULL)
1276 lp = lp->next;
1277 lp->next = nlp;
1279 mutex_exit(&rfs_async_write_lock);
1282 * Convert the file handle common to all of the requests
1283 * in this cluster to a vnode.
1285 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1286 if (vp == NULL) {
1287 mutex_enter(&rfs_async_write_lock);
1288 if (rfs_async_write_head == nlp)
1289 rfs_async_write_head = nlp->next;
1290 else {
1291 lp = rfs_async_write_head;
1292 while (lp->next != nlp)
1293 lp = lp->next;
1294 lp->next = nlp->next;
1296 t_flag = curthread->t_flag & T_WOULDBLOCK;
1297 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1298 rp->ns->ns_status = NFSERR_STALE;
1299 rp->thread->t_flag |= t_flag;
1301 cv_broadcast(&nlp->cv);
1302 mutex_exit(&rfs_async_write_lock);
1304 return;
1308 * Can only write regular files. Attempts to write any
1309 * other file types fail with EISDIR.
1311 if (vp->v_type != VREG) {
1312 VN_RELE(vp);
1313 mutex_enter(&rfs_async_write_lock);
1314 if (rfs_async_write_head == nlp)
1315 rfs_async_write_head = nlp->next;
1316 else {
1317 lp = rfs_async_write_head;
1318 while (lp->next != nlp)
1319 lp = lp->next;
1320 lp->next = nlp->next;
1322 t_flag = curthread->t_flag & T_WOULDBLOCK;
1323 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1324 rp->ns->ns_status = NFSERR_ISDIR;
1325 rp->thread->t_flag |= t_flag;
1327 cv_broadcast(&nlp->cv);
1328 mutex_exit(&rfs_async_write_lock);
1330 return;
1334 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1335 * deadlock with ufs.
1337 if (nbl_need_check(vp)) {
1338 nbl_start_crit(vp, RW_READER);
1339 in_crit = 1;
1342 ct.cc_sysid = 0;
1343 ct.cc_pid = 0;
1344 ct.cc_caller_id = nfs2_srv_caller_id;
1345 ct.cc_flags = CC_DONTBLOCK;
1348 * Lock the file for writing. This operation provides
1349 * the delay which allows clusters to grow.
1351 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1353 /* check if a monitor detected a delegation conflict */
1354 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1355 if (in_crit)
1356 nbl_end_crit(vp);
1357 VN_RELE(vp);
1358 /* mark as wouldblock so response is dropped */
1359 curthread->t_flag |= T_WOULDBLOCK;
1360 mutex_enter(&rfs_async_write_lock);
1361 if (rfs_async_write_head == nlp)
1362 rfs_async_write_head = nlp->next;
1363 else {
1364 lp = rfs_async_write_head;
1365 while (lp->next != nlp)
1366 lp = lp->next;
1367 lp->next = nlp->next;
1369 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1370 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1371 rp->ns->ns_status = puterrno(error);
1372 rp->thread->t_flag |= T_WOULDBLOCK;
1375 cv_broadcast(&nlp->cv);
1376 mutex_exit(&rfs_async_write_lock);
1378 return;
1382 * Disconnect this cluster from the list of clusters.
1383 * The cluster that is being dealt with must be fixed
1384 * in size after this point, so there is no reason
1385 * to leave it on the list so that new requests can
1386 * find it.
1388 * The algorithm is that the first write request will
1389 * create a cluster, convert the file handle to a
1390 * vnode pointer, and then lock the file for writing.
1391 * This request is not likely to be clustered with
1392 * any others. However, the next request will create
1393 * a new cluster and be blocked in VOP_RWLOCK while
1394 * the first request is being processed. This delay
1395 * will allow more requests to be clustered in this
1396 * second cluster.
1398 mutex_enter(&rfs_async_write_lock);
1399 if (rfs_async_write_head == nlp)
1400 rfs_async_write_head = nlp->next;
1401 else {
1402 lp = rfs_async_write_head;
1403 while (lp->next != nlp)
1404 lp = lp->next;
1405 lp->next = nlp->next;
1407 mutex_exit(&rfs_async_write_lock);
1410 * Step through the list of requests in this cluster.
1411 * We need to check permissions to make sure that all
1412 * of the requests have sufficient permission to write
1413 * the file. A cluster can be composed of requests
1414 * from different clients and different users on each
1415 * client.
1417 * As a side effect, we also calculate the size of the
1418 * byte range that this cluster encompasses.
1420 rp = nlp->list;
1421 off = rp->wa->wa_offset;
1422 len = (uint_t)0;
1423 do {
1424 if (rdonly(exi, rp->req)) {
1425 rp->ns->ns_status = NFSERR_ROFS;
1426 t_flag = curthread->t_flag & T_WOULDBLOCK;
1427 rp->thread->t_flag |= t_flag;
1428 continue;
1431 va.va_mask = AT_UID|AT_MODE;
1433 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1435 if (!error) {
1436 if (crgetuid(rp->cr) != va.va_uid) {
1438 * This is a kludge to allow writes of files
1439 * created with read only permission. The
1440 * owner of the file is always allowed to
1441 * write it.
1443 error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1445 if (!error && MANDLOCK(vp, va.va_mode))
1446 error = EACCES;
1450 * Check for a conflict with a nbmand-locked region.
1452 if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1453 rp->wa->wa_count, 0, NULL)) {
1454 error = EACCES;
1457 if (error) {
1458 rp->ns->ns_status = puterrno(error);
1459 t_flag = curthread->t_flag & T_WOULDBLOCK;
1460 rp->thread->t_flag |= t_flag;
1461 continue;
1463 if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1464 len = rp->wa->wa_offset + rp->wa->wa_count - off;
1465 } while ((rp = rp->list) != NULL);
1468 * Step through the cluster attempting to gather as many
1469 * requests which are contiguous as possible. These
1470 * contiguous requests are handled via one call to VOP_WRITE
1471 * instead of different calls to VOP_WRITE. We also keep
1472 * track of the fact that any data was written.
1474 rp = nlp->list;
1475 data_written = 0;
1476 do {
1478 * Skip any requests which are already marked as having an
1479 * error.
1481 if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1482 rp = rp->list;
1483 continue;
1487 * Count the number of iovec's which are required
1488 * to handle this set of requests. One iovec is
1489 * needed for each data buffer, whether addressed
1490 * by wa_data or by the b_rptr pointers in the
1491 * mblk chains.
1493 iovcnt = 0;
1494 lrp = rp;
1495 for (;;) {
1496 if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1497 iovcnt++;
1498 else {
1499 m = lrp->wa->wa_mblk;
1500 while (m != NULL) {
1501 iovcnt++;
1502 m = m->b_cont;
1505 if (lrp->list == NULL ||
1506 lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1507 lrp->wa->wa_offset + lrp->wa->wa_count !=
1508 lrp->list->wa->wa_offset) {
1509 lrp = lrp->list;
1510 break;
1512 lrp = lrp->list;
1515 if (iovcnt <= MAXCLIOVECS) {
1516 #ifdef DEBUG
1517 rfs_write_hits++;
1518 #endif
1519 niovp = iov;
1520 } else {
1521 #ifdef DEBUG
1522 rfs_write_misses++;
1523 #endif
1524 niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1527 * Put together the scatter/gather iovecs.
1529 iovp = niovp;
1530 trp = rp;
1531 count = 0;
1532 do {
1533 if (trp->wa->wa_data || trp->wa->wa_rlist) {
1534 if (trp->wa->wa_rlist) {
1535 iovp->iov_base =
1536 (char *)((trp->wa->wa_rlist)->
1537 u.c_daddr3);
1538 iovp->iov_len = trp->wa->wa_count;
1539 } else {
1540 iovp->iov_base = trp->wa->wa_data;
1541 iovp->iov_len = trp->wa->wa_count;
1543 iovp++;
1544 } else {
1545 m = trp->wa->wa_mblk;
1546 rcount = trp->wa->wa_count;
1547 while (m != NULL) {
1548 iovp->iov_base = (caddr_t)m->b_rptr;
1549 iovp->iov_len = (m->b_wptr - m->b_rptr);
1550 rcount -= iovp->iov_len;
1551 if (rcount < 0)
1552 iovp->iov_len += rcount;
1553 iovp++;
1554 if (rcount <= 0)
1555 break;
1556 m = m->b_cont;
1559 count += trp->wa->wa_count;
1560 trp = trp->list;
1561 } while (trp != lrp);
1563 uio.uio_iov = niovp;
1564 uio.uio_iovcnt = iovcnt;
1565 uio.uio_segflg = UIO_SYSSPACE;
1566 uio.uio_extflg = UIO_COPY_DEFAULT;
1567 uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1568 uio.uio_resid = count;
1570 * The limit is checked on the client. We
1571 * should allow any size writes here.
1573 uio.uio_llimit = curproc->p_fsz_ctl;
1574 rlimit = uio.uio_llimit - rp->wa->wa_offset;
1575 if (rlimit < (rlim64_t)uio.uio_resid)
1576 uio.uio_resid = (uint_t)rlimit;
1579 * For now we assume no append mode.
1583 * We're changing creds because VM may fault
1584 * and we need the cred of the current
1585 * thread to be used if quota * checking is
1586 * enabled.
1588 savecred = curthread->t_cred;
1589 curthread->t_cred = cr;
1590 error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1591 curthread->t_cred = savecred;
1593 /* check if a monitor detected a delegation conflict */
1594 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1595 /* mark as wouldblock so response is dropped */
1596 curthread->t_flag |= T_WOULDBLOCK;
1598 if (niovp != iov)
1599 kmem_free(niovp, sizeof (*niovp) * iovcnt);
1601 if (!error) {
1602 data_written = 1;
1604 * Get attributes again so we send the latest mod
1605 * time to the client side for his cache.
1607 va.va_mask = AT_ALL; /* now we want everything */
1609 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1611 if (!error)
1612 acl_perm(vp, exi, &va, rp->cr);
1616 * Fill in the status responses for each request
1617 * which was just handled. Also, copy the latest
1618 * attributes in to the attribute responses if
1619 * appropriate.
1621 t_flag = curthread->t_flag & T_WOULDBLOCK;
1622 do {
1623 rp->thread->t_flag |= t_flag;
1624 /* check for overflows */
1625 if (!error) {
1626 error = vattr_to_nattr(&va, &rp->ns->ns_attr);
1628 rp->ns->ns_status = puterrno(error);
1629 rp = rp->list;
1630 } while (rp != lrp);
1631 } while (rp != NULL);
1634 * If any data was written at all, then we need to flush
1635 * the data and metadata to stable storage.
1637 if (data_written) {
1638 error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1640 if (!error) {
1641 error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1645 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1647 if (in_crit)
1648 nbl_end_crit(vp);
1649 VN_RELE(vp);
1651 t_flag = curthread->t_flag & T_WOULDBLOCK;
1652 mutex_enter(&rfs_async_write_lock);
1653 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1654 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1655 rp->ns->ns_status = puterrno(error);
1656 rp->thread->t_flag |= t_flag;
1659 cv_broadcast(&nlp->cv);
1660 mutex_exit(&rfs_async_write_lock);
/*
 * Return a pointer to the file handle embedded in the NFSv2 WRITE
 * arguments; used by the dispatcher to look up the export before
 * the procedure itself runs.
 */
void *
rfs_write_getfh(struct nfswriteargs *wa)
{
	return (&wa->wa_fhandle);
}
1671 * Create a file.
1672 * Creates a file with given attributes and returns those attributes
1673 * and an fhandle for the new file.
1675 void
1676 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1677 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1679 int error;
1680 int lookuperr;
1681 int in_crit = 0;
1682 struct vattr va;
1683 vnode_t *vp;
1684 vnode_t *realvp;
1685 vnode_t *dvp;
1686 char *name = args->ca_da.da_name;
1687 vnode_t *tvp = NULL;
1688 int mode;
1689 int lookup_ok;
1690 bool_t trunc;
1691 struct sockaddr *ca;
1694 * Disallow NULL paths
1696 if (name == NULL || *name == '\0') {
1697 dr->dr_status = NFSERR_ACCES;
1698 return;
1701 dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1702 if (dvp == NULL) {
1703 dr->dr_status = NFSERR_STALE;
1704 return;
1707 error = sattr_to_vattr(args->ca_sa, &va);
1708 if (error) {
1709 dr->dr_status = puterrno(error);
1710 return;
1714 * Must specify the mode.
1716 if (!(va.va_mask & AT_MODE)) {
1717 VN_RELE(dvp);
1718 dr->dr_status = NFSERR_INVAL;
1719 return;
1723 * This is a completely gross hack to make mknod
1724 * work over the wire until we can wack the protocol
1726 if ((va.va_mode & IFMT) == IFCHR) {
1727 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1728 va.va_type = VFIFO; /* xtra kludge for named pipe */
1729 else {
1730 va.va_type = VCHR;
1732 * uncompress the received dev_t
1733 * if the top half is zero indicating a request
1734 * from an `older style' OS.
1736 if ((va.va_size & 0xffff0000) == 0)
1737 va.va_rdev = nfsv2_expdev(va.va_size);
1738 else
1739 va.va_rdev = (dev_t)va.va_size;
1741 va.va_mask &= ~AT_SIZE;
1742 } else if ((va.va_mode & IFMT) == IFBLK) {
1743 va.va_type = VBLK;
1745 * uncompress the received dev_t
1746 * if the top half is zero indicating a request
1747 * from an `older style' OS.
1749 if ((va.va_size & 0xffff0000) == 0)
1750 va.va_rdev = nfsv2_expdev(va.va_size);
1751 else
1752 va.va_rdev = (dev_t)va.va_size;
1753 va.va_mask &= ~AT_SIZE;
1754 } else if ((va.va_mode & IFMT) == IFSOCK) {
1755 va.va_type = VSOCK;
1756 } else {
1757 va.va_type = VREG;
1759 va.va_mode &= ~IFMT;
1760 va.va_mask |= AT_TYPE;
1762 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1763 name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1764 MAXPATHLEN);
1765 if (name == NULL) {
1766 dr->dr_status = puterrno(EINVAL);
1767 return;
1771 * Why was the choice made to use VWRITE as the mode to the
1772 * call to VOP_CREATE ? This results in a bug. When a client
1773 * opens a file that already exists and is RDONLY, the second
1774 * open fails with an EACESS because of the mode.
1775 * bug ID 1054648.
1777 lookup_ok = 0;
1778 mode = VWRITE;
1779 if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1780 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1781 NULL, NULL, NULL);
1782 if (!error) {
1783 struct vattr at;
1785 lookup_ok = 1;
1786 at.va_mask = AT_MODE;
1787 error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1788 if (!error)
1789 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1790 VN_RELE(tvp);
1791 tvp = NULL;
1795 if (!lookup_ok) {
1796 if (rdonly(exi, req)) {
1797 error = EROFS;
1798 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1799 va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1800 error = EPERM;
1801 } else {
1802 error = 0;
1807 * If file size is being modified on an already existing file
1808 * make sure that there are no conflicting non-blocking mandatory
1809 * locks in the region being manipulated. Return EACCES if there
1810 * are conflicting locks.
1812 if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1813 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1814 NULL, NULL, NULL);
1816 if (!lookuperr &&
1817 rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1818 VN_RELE(tvp);
1819 curthread->t_flag |= T_WOULDBLOCK;
1820 goto out;
1823 if (!lookuperr && nbl_need_check(tvp)) {
1825 * The file exists. Now check if it has any
1826 * conflicting non-blocking mandatory locks
1827 * in the region being changed.
1829 struct vattr bva;
1830 u_offset_t offset;
1831 ssize_t length;
1833 nbl_start_crit(tvp, RW_READER);
1834 in_crit = 1;
1836 bva.va_mask = AT_SIZE;
1837 error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1838 if (!error) {
1839 if (va.va_size < bva.va_size) {
1840 offset = va.va_size;
1841 length = bva.va_size - va.va_size;
1842 } else {
1843 offset = bva.va_size;
1844 length = va.va_size - bva.va_size;
1846 if (length) {
1847 if (nbl_conflict(tvp, NBL_WRITE,
1848 offset, length, 0, NULL)) {
1849 error = EACCES;
1853 if (error) {
1854 nbl_end_crit(tvp);
1855 VN_RELE(tvp);
1856 in_crit = 0;
1858 } else if (tvp != NULL) {
1859 VN_RELE(tvp);
1863 if (!error) {
1865 * If filesystem is shared with nosuid the remove any
1866 * setuid/setgid bits on create.
1868 if (va.va_type == VREG &&
1869 exi->exi_export.ex_flags & EX_NOSUID)
1870 va.va_mode &= ~(VSUID | VSGID);
1872 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1873 NULL, NULL);
1875 if (!error) {
1877 if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1878 trunc = TRUE;
1879 else
1880 trunc = FALSE;
1882 if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1883 VN_RELE(vp);
1884 curthread->t_flag |= T_WOULDBLOCK;
1885 goto out;
1887 va.va_mask = AT_ALL;
1889 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1891 /* check for overflows */
1892 if (!error) {
1893 acl_perm(vp, exi, &va, cr);
1894 error = vattr_to_nattr(&va, &dr->dr_attr);
1895 if (!error) {
1896 error = makefh(&dr->dr_fhandle, vp,
1897 exi);
1901 * Force modified metadata out to stable storage.
1903 * if a underlying vp exists, pass it to VOP_FSYNC
1905 if (VOP_REALVP(vp, &realvp, NULL) == 0)
1906 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
1907 else
1908 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1909 VN_RELE(vp);
1912 if (in_crit) {
1913 nbl_end_crit(tvp);
1914 VN_RELE(tvp);
1919 * Force modified data and metadata out to stable storage.
1921 (void) VOP_FSYNC(dvp, 0, cr, NULL);
1923 out:
1925 VN_RELE(dvp);
1927 dr->dr_status = puterrno(error);
1929 if (name != args->ca_da.da_name)
1930 kmem_free(name, MAXPATHLEN);
/*
 * Return a pointer to the file handle of the directory in which the
 * CREATE will take place; used by the dispatcher for export lookup.
 */
void *
rfs_create_getfh(struct nfscreatargs *args)
{
	return (args->ca_da.da_fhandle);
}
/*
 * Remove a file.
 * Remove named file from parent directory.
 *
 * The target is looked up first so we can (a) recall any NFSv4 write
 * delegation on it (dropping this request via T_WOULDBLOCK so the
 * client retransmits later) and (b) check for conflicting non-blocking
 * mandatory share reservations before calling VOP_REMOVE.
 */
void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error = 0;
	vnode_t *vp;		/* parent directory, held by nfs_fhtovp */
	vnode_t *targvp;	/* file being removed, held by VOP_LOOKUP */
	int in_crit = 0;	/* nonzero once inside nbl critical region */

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 */
	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(vp);
		*status = puterrno(error);
		return;
	}

	/*
	 * If the file is delegated to an v4 client, then initiate
	 * recall and drop this request (by setting T_WOULDBLOCK).
	 * The client will eventually re-transmit the request and
	 * (hopefully), by then, the v4 client will have returned
	 * the delegation.
	 */

	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
		VN_RELE(vp);
		VN_RELE(targvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (nbl_need_check(targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

out:
	/* Leave the critical region (if entered) before dropping holds. */
	if (in_crit)
		nbl_end_crit(targvp);
	VN_RELE(targvp);
	VN_RELE(vp);

	*status = puterrno(error);
}
/*
 * Return a pointer to the parent-directory file handle of a REMOVE
 * request; used by the dispatcher for export lookup.
 */
void *
rfs_remove_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
/*
 * rename a file
 * Give a file (from) a new name (to).
 *
 * Both directory handles must resolve to the same export (NFSv2 renames
 * may not cross exports), both must be directories, and the export must
 * be writable.  Delegations on the source file and on any file being
 * renamed over are recalled first; in that case the request is dropped
 * by setting T_WOULDBLOCK so the client retransmits later.
 */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error = 0;
	vnode_t *fromvp;	/* source parent directory */
	vnode_t *tovp;		/* target parent directory */
	struct exportinfo *to_exi;
	fhandle_t *fh;
	vnode_t *srcvp;		/* file being renamed */
	vnode_t *targvp;	/* existing file being renamed over, if any */
	int in_crit = 0;

	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	/* Only the pointer value is compared below, so release now. */
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 */
	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = puterrno(error);
		return;
	}

	/* Check for delegations on the source file */

	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		VN_RELE(srcvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Check for delegation on the file being renamed over, if it exists */

	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
	    NULL, NULL, NULL) == 0) {

		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
			VN_RELE(tovp);
			VN_RELE(fromvp);
			VN_RELE(srcvp);
			VN_RELE(targvp);
			curthread->t_flag |= T_WOULDBLOCK;
			return;
		}
		VN_RELE(targvp);
	}

	/* Guard against non-blocking mandatory locks on the source. */
	if (nbl_need_check(srcvp)) {
		nbl_start_crit(srcvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_RENAME(fromvp, args->rna_from.da_name,
	    tovp, args->rna_to.da_name, cr, NULL, 0);

	/* Keep the vnode's cached pathname up to date for auditing etc. */
	if (error == 0)
		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
		    strlen(args->rna_to.da_name));

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(srcvp);
	VN_RELE(srcvp);
	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);
}
/*
 * Return a pointer to the source-directory file handle of a RENAME
 * request; used by the dispatcher for export lookup.
 */
void *
rfs_rename_getfh(struct nfsrnmargs *args)
{
	return (args->rna_from.da_fhandle);
}
2178 * Link to a file.
2179 * Create a file (to) which is a hard link to the given file (from).
2181 void
2182 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2183 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2185 int error;
2186 vnode_t *fromvp;
2187 vnode_t *tovp;
2188 struct exportinfo *to_exi;
2189 fhandle_t *fh;
2191 fromvp = nfs_fhtovp(args->la_from, exi);
2192 if (fromvp == NULL) {
2193 *status = NFSERR_STALE;
2194 return;
2197 fh = args->la_to.da_fhandle;
2198 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2199 if (to_exi == NULL) {
2200 VN_RELE(fromvp);
2201 *status = NFSERR_ACCES;
2202 return;
2204 exi_rele(to_exi);
2206 if (to_exi != exi) {
2207 VN_RELE(fromvp);
2208 *status = NFSERR_XDEV;
2209 return;
2212 tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2213 if (tovp == NULL) {
2214 VN_RELE(fromvp);
2215 *status = NFSERR_STALE;
2216 return;
2219 if (tovp->v_type != VDIR) {
2220 VN_RELE(tovp);
2221 VN_RELE(fromvp);
2222 *status = NFSERR_NOTDIR;
2223 return;
2226 * Disallow NULL paths
2228 if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2229 VN_RELE(tovp);
2230 VN_RELE(fromvp);
2231 *status = NFSERR_ACCES;
2232 return;
2235 if (rdonly(exi, req)) {
2236 VN_RELE(tovp);
2237 VN_RELE(fromvp);
2238 *status = NFSERR_ROFS;
2239 return;
2242 error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2245 * Force modified data and metadata out to stable storage.
2247 (void) VOP_FSYNC(tovp, 0, cr, NULL);
2248 (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2250 VN_RELE(tovp);
2251 VN_RELE(fromvp);
2253 *status = puterrno(error);
/*
 * Return a pointer to the source file handle of a LINK request; used
 * by the dispatcher for export lookup.
 */
void *
rfs_link_getfh(struct nfslinkargs *args)
{
	return (args->la_from);
}
2263 * Symbolicly link to a file.
2264 * Create a file (to) with the given attributes which is a symbolic link
2265 * to the given path name (to).
2267 void
2268 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2269 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2271 int error;
2272 struct vattr va;
2273 vnode_t *vp;
2274 vnode_t *svp;
2275 int lerror;
2276 struct sockaddr *ca;
2277 char *name = NULL;
2280 * Disallow NULL paths
2282 if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2283 *status = NFSERR_ACCES;
2284 return;
2287 vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2288 if (vp == NULL) {
2289 *status = NFSERR_STALE;
2290 return;
2293 if (rdonly(exi, req)) {
2294 VN_RELE(vp);
2295 *status = NFSERR_ROFS;
2296 return;
2299 error = sattr_to_vattr(args->sla_sa, &va);
2300 if (error) {
2301 VN_RELE(vp);
2302 *status = puterrno(error);
2303 return;
2306 if (!(va.va_mask & AT_MODE)) {
2307 VN_RELE(vp);
2308 *status = NFSERR_INVAL;
2309 return;
2312 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2313 name = nfscmd_convname(ca, exi, args->sla_tnm,
2314 NFSCMD_CONV_INBOUND, MAXPATHLEN);
2316 if (name == NULL) {
2317 *status = NFSERR_ACCES;
2318 return;
2321 va.va_type = VLNK;
2322 va.va_mask |= AT_TYPE;
2324 error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2327 * Force new data and metadata out to stable storage.
2329 lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2330 NULL, cr, NULL, NULL, NULL);
2332 if (!lerror) {
2333 (void) VOP_FSYNC(svp, 0, cr, NULL);
2334 VN_RELE(svp);
2338 * Force modified data and metadata out to stable storage.
2340 (void) VOP_FSYNC(vp, 0, cr, NULL);
2342 VN_RELE(vp);
2344 *status = puterrno(error);
2345 if (name != args->sla_tnm)
2346 kmem_free(name, MAXPATHLEN);
/*
 * Return a pointer to the parent-directory file handle of a SYMLINK
 * request; used by the dispatcher for export lookup.
 */
void *
rfs_symlink_getfh(struct nfsslargs *args)
{
	return (args->sla_from.da_fhandle);
}
/*
 * Make a directory.
 * Create a directory with the given name, parent directory, and attributes.
 * Returns a file handle and attributes for the new directory.
 */
void
rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	struct vattr va;
	vnode_t *dvp = NULL;	/* the newly created directory */
	vnode_t *vp;		/* the parent directory */
	char *name = args->ca_da.da_name;

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (vp == NULL) {
		dr->dr_status = NFSERR_STALE;
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(vp);
		dr->dr_status = puterrno(error);
		return;
	}

	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_INVAL;
		return;
	}

	va.va_type = VDIR;
	va.va_mask |= AT_TYPE;

	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);

	if (!error) {
		/*
		 * Attributes of the newly created directory should
		 * be returned to the client.
		 */
		va.va_mask = AT_ALL; /* We want everything */
		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);

		/* check for overflows */
		if (!error) {
			/*
			 * NOTE(review): acl_perm() is given the parent
			 * vnode (vp) although the attributes in va are
			 * the new directory's (dvp) — looks like it
			 * should be dvp; confirm before changing.
			 */
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				error = makefh(&dr->dr_fhandle, dvp, exi);
			}
		}
		/*
		 * Force new data and metadata out to stable storage.
		 */
		(void) VOP_FSYNC(dvp, 0, cr, NULL);
		VN_RELE(dvp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	dr->dr_status = puterrno(error);
}
/*
 * Return a pointer to the parent-directory file handle of a MKDIR
 * request; used by the dispatcher for export lookup.
 */
void *
rfs_mkdir_getfh(struct nfscreatargs *args)
{
	return (args->ca_da.da_fhandle);
}
2448 * Remove a directory.
2449 * Remove the given directory name from the given parent directory.
2451 void
2452 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2453 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2455 int error;
2456 vnode_t *vp;
2460 * Disallow NULL paths
2462 if (da->da_name == NULL || *da->da_name == '\0') {
2463 *status = NFSERR_ACCES;
2464 return;
2467 vp = nfs_fhtovp(da->da_fhandle, exi);
2468 if (vp == NULL) {
2469 *status = NFSERR_STALE;
2470 return;
2473 if (rdonly(exi, req)) {
2474 VN_RELE(vp);
2475 *status = NFSERR_ROFS;
2476 return;
2480 * VOP_RMDIR now takes a new third argument (the current
2481 * directory of the process). That's because someone
2482 * wants to return EINVAL if one tries to remove ".".
2483 * Of course, NFS servers have no idea what their
2484 * clients' current directories are. We fake it by
2485 * supplying a vnode known to exist and illegal to
2486 * remove.
2488 error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
2491 * Force modified data and metadata out to stable storage.
2493 (void) VOP_FSYNC(vp, 0, cr, NULL);
2495 VN_RELE(vp);
2498 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2499 * if the directory is not empty. A System V NFS server
2500 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2501 * over the wire.
2503 if (error == EEXIST)
2504 *status = NFSERR_NOTEMPTY;
2505 else
2506 *status = puterrno(error);
/*
 * Return a pointer to the parent-directory file handle of an RMDIR
 * request; used by the dispatcher for export lookup.
 */
void *
rfs_rmdir_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
/*
 * Read directory entries.
 * Fills rd with up to rda_count bytes of dirent data starting at
 * rda_offset; the buffer is allocated here and later released by
 * rfs_rddirfree().  Entry names may be character-set converted for the
 * client; entries that no longer fit after conversion are dropped and
 * the size/eof results patched up accordingly.
 */
/* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int iseof;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	char *ndata = NULL;	/* converted entry buffer, if any */
	struct sockaddr *ca;
	size_t nents;
	int ret;

	vp = nfs_fhtovp(&rda->rda_fh, exi);
	if (vp == NULL) {
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_STALE;
		return;
	}

	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_NOTDIR;
		return;
	}

	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);

	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);

	if (error) {
		rd->rd_entries = NULL;
		goto bad;
	}

	if (rda->rda_count == 0) {
		rd->rd_entries = NULL;
		rd->rd_size = 0;
		rd->rd_eof = FALSE;
		goto bad;
	}

	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);

	/*
	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
	 */
	rd->rd_bufsize = (uint_t)rda->rda_count;
	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);

	/*
	 * Set up io vector to read directory data
	 */
	iov.iov_base = (caddr_t)rd->rd_entries;
	iov.iov_len = rda->rda_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)rda->rda_offset;
	uio.uio_resid = rda->rda_count;

	/*
	 * read directory
	 */
	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);

	/*
	 * Clean up
	 */
	if (!error) {
		/*
		 * set size and eof
		 */
		if (uio.uio_resid == rda->rda_count) {
			/* nothing read: report eof with no entries */
			rd->rd_size = 0;
			rd->rd_eof = TRUE;
		} else {
			rd->rd_size = (uint32_t)(rda->rda_count -
			    uio.uio_resid);
			rd->rd_eof = iseof ? TRUE : FALSE;
		}
	}

	/* Convert entry names to the client's character set, if needed. */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
	    rda->rda_count, &ndata);

	if (ret != 0) {
		size_t dropbytes;
		/*
		 * We had to drop one or more entries in order to fit
		 * during the character conversion.  We need to patch
		 * up the size and eof info.
		 */
		if (rd->rd_eof)
			rd->rd_eof = FALSE;
		dropbytes = nfscmd_dropped_entrysize(
		    (struct dirent64 *)rd->rd_entries, nents, ret);
		rd->rd_size -= dropbytes;
	}
	if (ndata == NULL) {
		/* no conversion took place; keep the original buffer */
		ndata = (char *)rd->rd_entries;
	} else if (ndata != (char *)rd->rd_entries) {
		/* conversion produced a new buffer; swap it in */
		kmem_free(rd->rd_entries, rd->rd_bufsize);
		rd->rd_entries = (void *)ndata;
		rd->rd_bufsize = rda->rda_count;
	}

bad:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rd->rd_status = puterrno(error);
}
/*
 * Return a pointer to the directory file handle of a READDIR request;
 * used by the dispatcher for export lookup.
 */
void *
rfs_readdir_getfh(struct nfsrddirargs *rda)
{
	return (&rda->rda_fh);
}
2653 void
2654 rfs_rddirfree(struct nfsrddirres *rd)
2656 if (rd->rd_entries != NULL)
2657 kmem_free(rd->rd_entries, rd->rd_bufsize);
2660 /* ARGSUSED */
2661 void
2662 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2663 struct svc_req *req, cred_t *cr)
2665 int error;
2666 struct statvfs64 sb;
2667 vnode_t *vp;
2669 vp = nfs_fhtovp(fh, exi);
2670 if (vp == NULL) {
2671 fs->fs_status = NFSERR_STALE;
2672 return;
2675 error = VFS_STATVFS(vp->v_vfsp, &sb);
2677 if (!error) {
2678 fs->fs_tsize = nfstsize();
2679 fs->fs_bsize = sb.f_frsize;
2680 fs->fs_blocks = sb.f_blocks;
2681 fs->fs_bfree = sb.f_bfree;
2682 fs->fs_bavail = sb.f_bavail;
2685 VN_RELE(vp);
2687 fs->fs_status = puterrno(error);
/*
 * Return the file handle of a STATFS request; used by the dispatcher
 * for export lookup.
 */
void *
rfs_statfs_getfh(fhandle_t *fh)
{
	return (fh);
}
/*
 * Convert NFSv2 settable attributes (nfssattr) into a vattr, setting
 * va_mask bits only for fields the client actually supplied.  In the
 * v2 protocol an all-ones value means "don't set this field".
 * Returns 0, or EOVERFLOW on 32-bit kernels when a supplied time does
 * not fit in a time_t.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2763 static enum nfsftype vt_to_nf[] = {
2764 0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2768 * check the following fields for overflow: nodeid, size, and time.
2769 * There could be a problem when converting 64-bit LP64 fields
2770 * into 32-bit ones. Return an error if there is an overflow.
2773 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2775 ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2776 na->na_type = vt_to_nf[vap->va_type];
2778 if (vap->va_mode == (unsigned short) -1)
2779 na->na_mode = (uint32_t)-1;
2780 else
2781 na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2783 if (vap->va_uid == (unsigned short)(-1))
2784 na->na_uid = (uint32_t)(-1);
2785 else if (vap->va_uid == UID_NOBODY)
2786 na->na_uid = (uint32_t)NFS_UID_NOBODY;
2787 else
2788 na->na_uid = vap->va_uid;
2790 if (vap->va_gid == (unsigned short)(-1))
2791 na->na_gid = (uint32_t)-1;
2792 else if (vap->va_gid == GID_NOBODY)
2793 na->na_gid = (uint32_t)NFS_GID_NOBODY;
2794 else
2795 na->na_gid = vap->va_gid;
2798 * Do we need to check fsid for overflow? It is 64-bit in the
2799 * vattr, but are bigger than 32 bit values supported?
2801 na->na_fsid = vap->va_fsid;
2803 na->na_nodeid = vap->va_nodeid;
2806 * Check to make sure that the nodeid is representable over the
2807 * wire without losing bits.
2809 if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2810 return (EFBIG);
2811 na->na_nlink = vap->va_nlink;
2814 * Check for big files here, instead of at the caller. See
2815 * comments in cstat for large special file explanation.
2817 if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2818 if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2819 return (EFBIG);
2820 if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2821 /* UNKNOWN_SIZE | OVERFLOW */
2822 na->na_size = MAXOFF32_T;
2823 } else
2824 na->na_size = vap->va_size;
2825 } else
2826 na->na_size = vap->va_size;
2829 * If the vnode times overflow the 32-bit times that NFS2
2830 * uses on the wire then return an error.
2832 if (!NFS_VAP_TIME_OK(vap)) {
2833 return (EOVERFLOW);
2835 na->na_atime.tv_sec = vap->va_atime.tv_sec;
2836 na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2838 na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2839 na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2841 na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2842 na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2845 * If the dev_t will fit into 16 bits then compress
2846 * it, otherwise leave it alone. See comments in
2847 * nfs_client.c.
2849 if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2850 getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2851 na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2852 else
2853 (void) cmpldev(&na->na_rdev, vap->va_rdev);
2855 na->na_blocks = vap->va_nblocks;
2856 na->na_blocksize = vap->va_blksize;
2859 * This bit of ugliness is a *TEMPORARY* hack to preserve the
2860 * over-the-wire protocols for named-pipe vnodes. It remaps the
2861 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2863 * BUYER BEWARE:
2864 * If you are porting the NFS to a non-Sun server, you probably
2865 * don't want to include the following block of code. The
2866 * over-the-wire special file types will be changing with the
2867 * NFS Protocol Revision.
2869 if (vap->va_type == VFIFO)
2870 NA_SETFIFO(na);
2871 return (0);
2875 * acl v2 support: returns approximate permission.
2876 * default: returns minimal permission (more restrictive)
2877 * aclok: returns maximal permission (less restrictive)
2878 * This routine changes the permissions that are alaredy in *va.
2879 * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2880 * CLASS_OBJ is always the same as GROUP_OBJ entry.
2882 static void
2883 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2885 vsecattr_t vsa;
2886 int aclcnt;
2887 aclent_t *aclentp;
2888 mode_t mask_perm;
2889 mode_t grp_perm;
2890 mode_t other_perm;
2891 mode_t other_orig;
2892 int error;
2894 /* dont care default acl */
2895 vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2896 error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
2898 if (!error) {
2899 aclcnt = vsa.vsa_aclcnt;
2900 if (aclcnt > MIN_ACL_ENTRIES) {
2901 /* non-trivial ACL */
2902 aclentp = vsa.vsa_aclentp;
2903 if (exi->exi_export.ex_flags & EX_ACLOK) {
2904 /* maximal permissions */
2905 grp_perm = 0;
2906 other_perm = 0;
2907 for (; aclcnt > 0; aclcnt--, aclentp++) {
2908 switch (aclentp->a_type) {
2909 case USER_OBJ:
2910 break;
2911 case USER:
2912 grp_perm |=
2913 aclentp->a_perm << 3;
2914 other_perm |= aclentp->a_perm;
2915 break;
2916 case GROUP_OBJ:
2917 grp_perm |=
2918 aclentp->a_perm << 3;
2919 break;
2920 case GROUP:
2921 other_perm |= aclentp->a_perm;
2922 break;
2923 case OTHER_OBJ:
2924 other_orig = aclentp->a_perm;
2925 break;
2926 case CLASS_OBJ:
2927 mask_perm = aclentp->a_perm;
2928 break;
2929 default:
2930 break;
2933 grp_perm &= mask_perm << 3;
2934 other_perm &= mask_perm;
2935 other_perm |= other_orig;
2937 } else {
2938 /* minimal permissions */
2939 grp_perm = 070;
2940 other_perm = 07;
2941 for (; aclcnt > 0; aclcnt--, aclentp++) {
2942 switch (aclentp->a_type) {
2943 case USER_OBJ:
2944 break;
2945 case USER:
2946 case CLASS_OBJ:
2947 grp_perm &=
2948 aclentp->a_perm << 3;
2949 other_perm &=
2950 aclentp->a_perm;
2951 break;
2952 case GROUP_OBJ:
2953 grp_perm &=
2954 aclentp->a_perm << 3;
2955 break;
2956 case GROUP:
2957 other_perm &=
2958 aclentp->a_perm;
2959 break;
2960 case OTHER_OBJ:
2961 other_perm &=
2962 aclentp->a_perm;
2963 break;
2964 default:
2965 break;
2969 /* copy to va */
2970 va->va_mode &= ~077;
2971 va->va_mode |= grp_perm | other_perm;
2973 if (vsa.vsa_aclcnt)
2974 kmem_free(vsa.vsa_aclentp,
2975 vsa.vsa_aclcnt * sizeof (aclent_t));
2979 void
2980 rfs_srvrinit(void)
2982 mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
2983 nfs2_srv_caller_id = fs_new_caller_id();
2986 void
2987 rfs_srvrfini(void)
2989 mutex_destroy(&rfs_async_write_lock);
2992 static int
2993 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
2995 struct clist *wcl;
2996 int wlist_len;
2997 uint32_t count = rr->rr_count;
2999 wcl = ra->ra_wlist;
3001 if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3002 return (FALSE);
3005 wcl = ra->ra_wlist;
3006 rr->rr_ok.rrok_wlist_len = wlist_len;
3007 rr->rr_ok.rrok_wlist = wcl;
3009 return (TRUE);