kernel/fs/nfs/nfs4_vfsops.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  25  */
  26
  27 /*
  28  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  29  *      All Rights Reserved
  30  */
  31
  32 #include <sys/param.h>
  33 #include <sys/types.h>
  34 #include <sys/systm.h>
  35 #include <sys/cred.h>
  36 #include <sys/vfs.h>
  37 #include <sys/vnode.h>
  38 #include <sys/pathname.h>
  39 #include <sys/sysmacros.h>
  40 #include <sys/kmem.h>
  41 #include <sys/mkdev.h>
  42 #include <sys/mount.h>
  43 #include <sys/statvfs.h>
  44 #include <sys/errno.h>
  45 #include <sys/debug.h>
  46 #include <sys/cmn_err.h>
  47 #include <sys/utsname.h>
  48 #include <sys/bootconf.h>
  49 #include <sys/modctl.h>
  50 #include <sys/acl.h>
  51 #include <sys/flock.h>
  52 #include <sys/time.h>
  53 #include <sys/disp.h>
  54 #include <sys/policy.h>
  55 #include <sys/socket.h>
  56 #include <sys/netconfig.h>
  57 #include <sys/dnlc.h>
  58 #include <sys/list.h>
  59 #include <sys/mntent.h>
  60
  61 #include <rpc/types.h>
  62 #include <rpc/auth.h>
  63 #include <rpc/rpcsec_gss.h>
  64 #include <rpc/clnt.h>
  65
  66 #include <nfs/nfs.h>
  67 #include <nfs/nfs_clnt.h>
  68 #include <nfs/mount.h>
  69 #include <nfs/nfs_acl.h>
  70
  71 #include <sys/fs_subr.h>
  72
  73 #include <nfs/nfs4.h>
  74 #include <nfs/rnode4.h>
  75 #include <nfs/nfs4_clnt.h>
  76 #include <sys/fs/autofs.h>
  77
  78 #include <sys/sdt.h>
  79
  80
  81 /*
  82  * Arguments passed to thread to free data structures from forced unmount.
  83  */
  84
  85 typedef struct {
  86         vfs_t   *fm_vfsp;
  87         int     fm_flag;
  88         cred_t  *fm_cr;
  89 } freemountargs_t;
  90
  91 static void     async_free_mount(vfs_t *, int, cred_t *);
  92 static void     nfs4_free_mount(vfs_t *, int, cred_t *);
  93 static void     nfs4_free_mount_thread(freemountargs_t *);
  94 static int nfs4_chkdup_servinfo4(servinfo4_t *, servinfo4_t *);
  95
  96 /*
  97  * From rpcsec module (common/rpcsec).
  98  */
  99 extern int sec_clnt_loadinfo(struct sec_data *, struct sec_data **, model_t);
 100 extern void sec_clnt_freeinfo(struct sec_data *);
 101
 102 /*
 103  * The order and contents of this structure must be kept in sync with that of
 104  * rfsreqcnt_v4_tmpl in nfs_stats.c
 105  */
 106 static char *rfsnames_v4[] = {
 107         "null", "compound", "reserved", "access", "close", "commit", "create",
 108         "delegpurge", "delegreturn", "getattr", "getfh", "link", "lock",
 109         "lockt", "locku", "lookup", "lookupp", "nverify", "open", "openattr",
 110         "open_confirm", "open_downgrade", "putfh", "putpubfh", "putrootfh",
 111         "read", "readdir", "readlink", "remove", "rename", "renew",
 112         "restorefh", "savefh", "secinfo", "setattr", "setclientid",
 113         "setclientid_confirm", "verify", "write"
 114 };
 115
 116 /*
 117  * nfs4_max_mount_retry is the number of times the client will redrive
 118  * a mount compound before giving up and returning failure.  The intent
 119  * is to redrive mount compounds which fail NFS4ERR_STALE so that
 120  * if a component of the server path being mounted goes stale, it can
 121  * "recover" by redriving the mount compund (LOOKUP ops).  This recovery
 122  * code is needed outside of the recovery framework because mount is a
 123  * special case.  The client doesn't create vnodes/rnodes for components
 124  * of the server path being mounted.  The recovery code recovers real
 125  * client objects, not STALE FHs which map to components of the server
 126  * path being mounted.
 127  *
 128  * We could just fail the mount on the first time, but that would
 129  * instantly trigger failover (from nfs4_mount), and the client should
 130  * try to re-lookup the STALE FH before doing failover.  The easiest
 131  * way to "re-lookup" is to simply redrive the mount compound.
 132  */
 133 static int nfs4_max_mount_retry = 2;
 134
 135 /*
 136  * nfs4 vfs operations.
 137  */
 138 int             nfs4_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *);
 139 static int      nfs4_unmount(vfs_t *, int, cred_t *);
 140 static int      nfs4_root(vfs_t *, vnode_t **);
 141 static int      nfs4_statvfs(vfs_t *, struct statvfs64 *);
 142 static int      nfs4_sync(vfs_t *, short, cred_t *);
 143 static int      nfs4_vget(vfs_t *, vnode_t **, fid_t *);
 144 static int      nfs4_mountroot(vfs_t *, whymountroot_t);
 145 static void     nfs4_freevfs(vfs_t *);
 146
 147 static int      nfs4rootvp(vnode_t **, vfs_t *, struct servinfo4 *,
 148                     int, cred_t *, zone_t *);
 149
 150
 151 int nfs4_vfsinit(void);
 152 void nfs4_vfsfini(void);
 153 static void nfs4setclientid_init(void);
 154 static void nfs4setclientid_fini(void);
 155 static void nfs4setclientid_otw(mntinfo4_t *, servinfo4_t *,  cred_t *,
 156                 struct nfs4_server *, nfs4_error_t *, int *);
 157 static void     destroy_nfs4_server(nfs4_server_t *);
 158 static void     remove_mi(nfs4_server_t *, mntinfo4_t *);
 159
 160 extern void nfs4_ephemeral_init(void);
 161 extern void nfs4_ephemeral_fini(void);
 162
 163 /* referral related routines */
 164 static servinfo4_t *copy_svp(servinfo4_t *);
 165 static void free_knconf_contents(struct knetconfig *k);
 166 static char *extract_referral_point(const char *, int);
 167 static void setup_newsvpath(servinfo4_t *, int);
 168 static void update_servinfo4(servinfo4_t *, fs_location4 *,
 169                 struct nfs_fsl_info *, char *, int);
 170
 171 /*
 172  * Initialize the vfs structure
 173  */
 174
 175 static int nfs4fstyp;
 176
 177
 178 /*
 179  * Debug variable to check for rdma based
 180  * transport startup and cleanup. Controlled
 181  * through /etc/system. Off by default.
 182  */
 183 extern int rdma_debug;
 184
 185 const struct vfsops nfs4_vfsops = {
 186         .vfs_mount = nfs4_mount,
 187         .vfs_unmount = nfs4_unmount,
 188         .vfs_root = nfs4_root,
 189         .vfs_statvfs = nfs4_statvfs,
 190         .vfs_sync = nfs4_sync,
 191         .vfs_vget = nfs4_vget,
 192         .vfs_mountroot = nfs4_mountroot,
 193         .vfs_freevfs = nfs4_freevfs,
 194 };
 195
 196 int
 197 nfs4init(int fstyp, char *name)
 198 {
 199         int error;
 200
 201         error = vfs_setfsops(fstyp, &nfs4_vfsops);
 202         if (error != 0) {
 203                 zcmn_err(GLOBAL_ZONEID, CE_WARN,
 204                     "nfs4init: bad fstyp");
 205                 goto out;
 206         }
 207
 208         nfs4fstyp = fstyp;
 209         (void) nfs4_vfsinit();
 210         (void) nfs4_init_dot_entries();
 211
 212 out:
 213         if (error)
 214                 (void) vfs_freevfsops_by_type(fstyp);
 215
 216         return (error);
 217 }
 218
 219 void
 220 nfs4fini(void)
 221 {
 222         (void) nfs4_destroy_dot_entries();
 223         nfs4_vfsfini();
 224 }
 225
 226 /*
 227  * Create a new sec_data structure to store AUTH_DH related data:
 228  * netname, syncaddr, knetconfig. There is no AUTH_F_RPCTIMESYNC
 229  * flag set for NFS V4 since we are avoiding to contact the rpcbind
 230  * daemon and is using the IP time service (IPPORT_TIMESERVER).
 231  *
 232  * sec_data can be freed by sec_clnt_freeinfo().
 233  */
 234 static struct sec_data *
 235 create_authdh_data(char *netname, int nlen, struct netbuf *syncaddr,
 236     struct knetconfig *knconf)
 237 {
 238         struct sec_data *secdata;
 239         dh_k4_clntdata_t *data;
 240         char *pf, *p;
 241
 242         if (syncaddr == NULL || syncaddr->buf == NULL || nlen == 0)
 243                 return (NULL);
 244
 245         secdata = kmem_alloc(sizeof (*secdata), KM_SLEEP);
 246         secdata->flags = 0;
 247
 248         data = kmem_alloc(sizeof (*data), KM_SLEEP);
 249
 250         data->syncaddr.maxlen = syncaddr->maxlen;
 251         data->syncaddr.len = syncaddr->len;
 252         data->syncaddr.buf = kmem_alloc(syncaddr->len, KM_SLEEP);
 253         bcopy(syncaddr->buf, data->syncaddr.buf, syncaddr->len);
 254
 255         /*
 256          * duplicate the knconf information for the
 257          * new opaque data.
 258          */
 259         data->knconf = kmem_alloc(sizeof (*knconf), KM_SLEEP);
 260         *data->knconf = *knconf;
 261         pf = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
 262         p = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
 263         bcopy(knconf->knc_protofmly, pf, KNC_STRSIZE);
 264         bcopy(knconf->knc_proto, p, KNC_STRSIZE);
 265         data->knconf->knc_protofmly = pf;
 266         data->knconf->knc_proto = p;
 267
 268         /* move server netname to the sec_data structure */
 269         data->netname = kmem_alloc(nlen, KM_SLEEP);
 270         bcopy(netname, data->netname, nlen);
 271         data->netnamelen = (int)nlen;
 272
 273         secdata->secmod = AUTH_DH;
 274         secdata->rpcflavor = AUTH_DH;
 275         secdata->data = (caddr_t)data;
 276
 277         return (secdata);
 278 }
 279
 280 /*
 281  * Returns (deep) copy of sec_data_t. Allocates all memory required; caller
 282  * is responsible for freeing.
 283  */
 284 sec_data_t *
 285 copy_sec_data(sec_data_t *fsecdata)
 286 {
 287         sec_data_t *tsecdata;
 288
 289         if (fsecdata == NULL)
 290                 return (NULL);
 291
 292         if (fsecdata->rpcflavor == AUTH_DH) {
 293                 dh_k4_clntdata_t *fdata = (dh_k4_clntdata_t *)fsecdata->data;
 294
 295                 if (fdata == NULL)
 296                         return (NULL);
 297
 298                 tsecdata = (sec_data_t *)create_authdh_data(fdata->netname,
 299                     fdata->netnamelen, &fdata->syncaddr, fdata->knconf);
 300
 301                 return (tsecdata);
 302         }
 303
 304         tsecdata = kmem_zalloc(sizeof (sec_data_t), KM_SLEEP);
 305
 306         tsecdata->secmod = fsecdata->secmod;
 307         tsecdata->rpcflavor = fsecdata->rpcflavor;
 308         tsecdata->flags = fsecdata->flags;
 309         tsecdata->uid = fsecdata->uid;
 310
 311         if (fsecdata->rpcflavor == RPCSEC_GSS) {
 312                 gss_clntdata_t *gcd = (gss_clntdata_t *)fsecdata->data;
 313
 314                 tsecdata->data = (caddr_t)copy_sec_data_gss(gcd);
 315         } else {
 316                 tsecdata->data = NULL;
 317         }
 318
 319         return (tsecdata);
 320 }
 321
 322 gss_clntdata_t *
 323 copy_sec_data_gss(gss_clntdata_t *fdata)
 324 {
 325         gss_clntdata_t *tdata;
 326
 327         if (fdata == NULL)
 328                 return (NULL);
 329
 330         tdata = kmem_zalloc(sizeof (gss_clntdata_t), KM_SLEEP);
 331
 332         tdata->mechanism.length = fdata->mechanism.length;
 333         tdata->mechanism.elements = kmem_zalloc(fdata->mechanism.length,
 334             KM_SLEEP);
 335         bcopy(fdata->mechanism.elements, tdata->mechanism.elements,
 336             fdata->mechanism.length);
 337
 338         tdata->service = fdata->service;
 339
 340         (void) strcpy(tdata->uname, fdata->uname);
 341         (void) strcpy(tdata->inst, fdata->inst);
 342         (void) strcpy(tdata->realm, fdata->realm);
 343
 344         tdata->qop = fdata->qop;
 345
 346         return (tdata);
 347 }
 348
 349 static int
 350 nfs4_chkdup_servinfo4(servinfo4_t *svp_head, servinfo4_t *svp)
 351 {
 352         servinfo4_t *si;
 353
 354         /*
 355          * Iterate over the servinfo4 list to make sure
 356          * we do not have a duplicate. Skip any servinfo4
 357          * that has been marked "NOT IN USE"
 358          */
 359         for (si = svp_head; si; si = si->sv_next) {
 360                 (void) nfs_rw_enter_sig(&si->sv_lock, RW_READER, 0);
 361                 if (si->sv_flags & SV4_NOTINUSE) {
 362                         nfs_rw_exit(&si->sv_lock);
 363                         continue;
 364                 }
 365                 nfs_rw_exit(&si->sv_lock);
 366                 if (si == svp)
 367                         continue;
 368                 if (si->sv_addr.len == svp->sv_addr.len &&
 369                     strcmp(si->sv_knconf->knc_protofmly,
 370                     svp->sv_knconf->knc_protofmly) == 0 &&
 371                     bcmp(si->sv_addr.buf, svp->sv_addr.buf,
 372                     si->sv_addr.len) == 0) {
 373                         /* it's a duplicate */
 374                         return (1);
 375                 }
 376         }
 377         /* it's not a duplicate */
 378         return (0);
 379 }
 380
 381 void
 382 nfs4_free_args(struct nfs_args *nargs)
 383 {
 384         if (nargs->knconf) {
 385                 if (nargs->knconf->knc_protofmly)
 386                         kmem_free(nargs->knconf->knc_protofmly,
 387                             KNC_STRSIZE);
 388                 if (nargs->knconf->knc_proto)
 389                         kmem_free(nargs->knconf->knc_proto, KNC_STRSIZE);
 390                 kmem_free(nargs->knconf, sizeof (*nargs->knconf));
 391                 nargs->knconf = NULL;
 392         }
 393
 394         if (nargs->fh) {
 395                 kmem_free(nargs->fh, strlen(nargs->fh) + 1);
 396                 nargs->fh = NULL;
 397         }
 398
 399         if (nargs->hostname) {
 400                 kmem_free(nargs->hostname, strlen(nargs->hostname) + 1);
 401                 nargs->hostname = NULL;
 402         }
 403
 404         if (nargs->addr) {
 405                 if (nargs->addr->buf) {
 406                         ASSERT(nargs->addr->len);
 407                         kmem_free(nargs->addr->buf, nargs->addr->len);
 408                 }
 409                 kmem_free(nargs->addr, sizeof (struct netbuf));
 410                 nargs->addr = NULL;
 411         }
 412
 413         if (nargs->syncaddr) {
 414                 ASSERT(nargs->syncaddr->len);
 415                 if (nargs->syncaddr->buf) {
 416                         ASSERT(nargs->syncaddr->len);
 417                         kmem_free(nargs->syncaddr->buf, nargs->syncaddr->len);
 418                 }
 419                 kmem_free(nargs->syncaddr, sizeof (struct netbuf));
 420                 nargs->syncaddr = NULL;
 421         }
 422
 423         if (nargs->netname) {
 424                 kmem_free(nargs->netname, strlen(nargs->netname) + 1);
 425                 nargs->netname = NULL;
 426         }
 427
 428         if (nargs->nfs_ext_u.nfs_extA.secdata) {
 429                 sec_clnt_freeinfo(
 430                     nargs->nfs_ext_u.nfs_extA.secdata);
 431                 nargs->nfs_ext_u.nfs_extA.secdata = NULL;
 432         }
 433 }
 434
 435
 436 int
 437 nfs4_copyin(char *data, int datalen, struct nfs_args *nargs)
 438 {
 439
 440         int error;
 441         size_t hlen;                    /* length of hostname */
 442         size_t nlen;                    /* length of netname */
 443         char netname[MAXNETNAMELEN+1];  /* server's netname */
 444         struct netbuf addr;             /* server's address */
 445         struct netbuf syncaddr;         /* AUTH_DES time sync addr */
 446         struct knetconfig *knconf;              /* transport structure */
 447         struct sec_data *secdata = NULL;        /* security data */
 448         STRUCT_DECL(nfs_args, args);            /* nfs mount arguments */
 449         STRUCT_DECL(knetconfig, knconf_tmp);
 450         STRUCT_DECL(netbuf, addr_tmp);
 451         int flags;
 452         char *p, *pf;
 453         struct pathname pn;
 454         char *userbufptr;
 455
 456
 457         bzero(nargs, sizeof (*nargs));
 458
 459         STRUCT_INIT(args, get_udatamodel());
 460         bzero(STRUCT_BUF(args), SIZEOF_STRUCT(nfs_args, DATAMODEL_NATIVE));
 461         if (copyin(data, STRUCT_BUF(args), MIN(datalen,
 462             STRUCT_SIZE(args))))
 463                 return (EFAULT);
 464
 465         nargs->wsize = STRUCT_FGET(args, wsize);
 466         nargs->rsize = STRUCT_FGET(args, rsize);
 467         nargs->timeo = STRUCT_FGET(args, timeo);
 468         nargs->retrans = STRUCT_FGET(args, retrans);
 469         nargs->acregmin = STRUCT_FGET(args, acregmin);
 470         nargs->acregmax = STRUCT_FGET(args, acregmax);
 471         nargs->acdirmin = STRUCT_FGET(args, acdirmin);
 472         nargs->acdirmax = STRUCT_FGET(args, acdirmax);
 473
 474         flags = STRUCT_FGET(args, flags);
 475         nargs->flags = flags;
 476
 477         addr.buf = NULL;
 478         syncaddr.buf = NULL;
 479
 480
 481         /*
 482          * Allocate space for a knetconfig structure and
 483          * its strings and copy in from user-land.
 484          */
 485         knconf = kmem_zalloc(sizeof (*knconf), KM_SLEEP);
 486         STRUCT_INIT(knconf_tmp, get_udatamodel());
 487         if (copyin(STRUCT_FGETP(args, knconf), STRUCT_BUF(knconf_tmp),
 488             STRUCT_SIZE(knconf_tmp))) {
 489                 kmem_free(knconf, sizeof (*knconf));
 490                 return (EFAULT);
 491         }
 492
 493         knconf->knc_semantics = STRUCT_FGET(knconf_tmp, knc_semantics);
 494         knconf->knc_protofmly = STRUCT_FGETP(knconf_tmp, knc_protofmly);
 495         knconf->knc_proto = STRUCT_FGETP(knconf_tmp, knc_proto);
 496         if (get_udatamodel() != DATAMODEL_LP64) {
 497                 knconf->knc_rdev = expldev(STRUCT_FGET(knconf_tmp, knc_rdev));
 498         } else {
 499                 knconf->knc_rdev = STRUCT_FGET(knconf_tmp, knc_rdev);
 500         }
 501
 502         pf = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
 503         p = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
 504         error = copyinstr(knconf->knc_protofmly, pf, KNC_STRSIZE, NULL);
 505         if (error) {
 506                 kmem_free(pf, KNC_STRSIZE);
 507                 kmem_free(p, KNC_STRSIZE);
 508                 kmem_free(knconf, sizeof (*knconf));
 509                 return (error);
 510         }
 511
 512         error = copyinstr(knconf->knc_proto, p, KNC_STRSIZE, NULL);
 513         if (error) {
 514                 kmem_free(pf, KNC_STRSIZE);
 515                 kmem_free(p, KNC_STRSIZE);
 516                 kmem_free(knconf, sizeof (*knconf));
 517                 return (error);
 518         }
 519
 520
 521         knconf->knc_protofmly = pf;
 522         knconf->knc_proto = p;
 523
 524         nargs->knconf = knconf;
 525
 526         /*
 527          * Get server address
 528          */
 529         STRUCT_INIT(addr_tmp, get_udatamodel());
 530         if (copyin(STRUCT_FGETP(args, addr), STRUCT_BUF(addr_tmp),
 531             STRUCT_SIZE(addr_tmp))) {
 532                 error = EFAULT;
 533                 goto errout;
 534         }
 535
 536         nargs->addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
 537         userbufptr = STRUCT_FGETP(addr_tmp, buf);
 538         addr.len = STRUCT_FGET(addr_tmp, len);
 539         addr.buf = kmem_alloc(addr.len, KM_SLEEP);
 540         addr.maxlen = addr.len;
 541         if (copyin(userbufptr, addr.buf, addr.len)) {
 542                 kmem_free(addr.buf, addr.len);
 543                 error = EFAULT;
 544                 goto errout;
 545         }
 546         bcopy(&addr, nargs->addr, sizeof (struct netbuf));
 547
 548         /*
 549          * Get the root fhandle
 550          */
 551         error = pn_get(STRUCT_FGETP(args, fh), UIO_USERSPACE, &pn);
 552         if (error)
 553                 goto errout;
 554
 555         /* Volatile fh: keep server paths, so use actual-size strings */
 556         nargs->fh = kmem_alloc(pn.pn_pathlen + 1, KM_SLEEP);
 557         bcopy(pn.pn_path, nargs->fh, pn.pn_pathlen);
 558         nargs->fh[pn.pn_pathlen] = '\0';
 559         pn_free(&pn);
 560
 561
 562         /*
 563          * Get server's hostname
 564          */
 565         if (flags & NFSMNT_HOSTNAME) {
 566                 error = copyinstr(STRUCT_FGETP(args, hostname),
 567                     netname, sizeof (netname), &hlen);
 568                 if (error)
 569                         goto errout;
 570                 nargs->hostname = kmem_zalloc(hlen, KM_SLEEP);
 571                 (void) strcpy(nargs->hostname, netname);
 572
 573         } else {
 574                 nargs->hostname = NULL;
 575         }
 576
 577
 578         /*
 579          * If there are syncaddr and netname data, load them in. This is
 580          * to support data needed for NFSV4 when AUTH_DH is the negotiated
 581          * flavor via SECINFO. (instead of using MOUNT protocol in V3).
 582          */
 583         netname[0] = '\0';
 584         if (flags & NFSMNT_SECURE) {
 585
 586                 /* get syncaddr */
 587                 STRUCT_INIT(addr_tmp, get_udatamodel());
 588                 if (copyin(STRUCT_FGETP(args, syncaddr), STRUCT_BUF(addr_tmp),
 589                     STRUCT_SIZE(addr_tmp))) {
 590                         error = EINVAL;
 591                         goto errout;
 592                 }
 593                 userbufptr = STRUCT_FGETP(addr_tmp, buf);
 594                 syncaddr.len = STRUCT_FGET(addr_tmp, len);
 595                 syncaddr.buf = kmem_alloc(syncaddr.len, KM_SLEEP);
 596                 syncaddr.maxlen = syncaddr.len;
 597                 if (copyin(userbufptr, syncaddr.buf, syncaddr.len)) {
 598                         kmem_free(syncaddr.buf, syncaddr.len);
 599                         error = EFAULT;
 600                         goto errout;
 601                 }
 602
 603                 nargs->syncaddr = kmem_alloc(sizeof (struct netbuf), KM_SLEEP);
 604                 bcopy(&syncaddr, nargs->syncaddr, sizeof (struct netbuf));
 605
 606                 /* get server's netname */
 607                 if (copyinstr(STRUCT_FGETP(args, netname), netname,
 608                     sizeof (netname), &nlen)) {
 609                         error = EFAULT;
 610                         goto errout;
 611                 }
 612
 613                 netname[nlen] = '\0';
 614                 nargs->netname = kmem_zalloc(nlen, KM_SLEEP);
 615                 (void) strcpy(nargs->netname, netname);
 616         }
 617
 618         /*
 619          * Get the extention data which has the security data structure.
 620          * This includes data for AUTH_SYS as well.
 621          */
 622         if (flags & NFSMNT_NEWARGS) {
 623                 nargs->nfs_args_ext = STRUCT_FGET(args, nfs_args_ext);
 624                 if (nargs->nfs_args_ext == NFS_ARGS_EXTA ||
 625                     nargs->nfs_args_ext == NFS_ARGS_EXTB) {
 626                         /*
 627                          * Indicating the application is using the new
 628                          * sec_data structure to pass in the security
 629                          * data.
 630                          */
 631                         if (STRUCT_FGETP(args,
 632                             nfs_ext_u.nfs_extA.secdata) != NULL) {
 633                                 error = sec_clnt_loadinfo(
 634                                     (struct sec_data *)STRUCT_FGETP(args,
 635                                     nfs_ext_u.nfs_extA.secdata),
 636                                     &secdata, get_udatamodel());
 637                         }
 638                         nargs->nfs_ext_u.nfs_extA.secdata = secdata;
 639                 }
 640         }
 641
 642         if (error)
 643                 goto errout;
 644
 645         /*
 646          * Failover support:
 647          *
 648          * We may have a linked list of nfs_args structures,
 649          * which means the user is looking for failover.  If
 650          * the mount is either not "read-only" or "soft",
 651          * we want to bail out with EINVAL.
 652          */
 653         if (nargs->nfs_args_ext == NFS_ARGS_EXTB)
 654                 nargs->nfs_ext_u.nfs_extB.next =
 655                     STRUCT_FGETP(args, nfs_ext_u.nfs_extB.next);
 656
 657 errout:
 658         if (error)
 659                 nfs4_free_args(nargs);
 660
 661         return (error);
 662 }
 663
 664
 665 /*
 666  * nfs mount vfsop
 667  * Set up mount info record and attach it to vfs struct.
 668  */
 669 int
 670 nfs4_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 671 {
 672         char *data = uap->dataptr;
 673         int error;
 674         vnode_t *rtvp;                  /* the server's root */
 675         mntinfo4_t *mi;                 /* mount info, pointed at by vfs */
 676         struct knetconfig *rdma_knconf; /* rdma transport structure */
 677         rnode4_t *rp;
 678         struct servinfo4 *svp;          /* nfs server info */
 679         struct servinfo4 *svp_tail = NULL; /* previous nfs server info */
 680         struct servinfo4 *svp_head;     /* first nfs server info */
 681         struct servinfo4 *svp_2ndlast;  /* 2nd last in server info list */
 682         struct sec_data *secdata;       /* security data */
 683         struct nfs_args *args = NULL;
 684         int flags, addr_type, removed;
 685         zone_t *zone = nfs_zone();
 686         nfs4_error_t n4e;
 687         zone_t *mntzone = NULL;
 688
 689         if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
 690                 return (EPERM);
 691         if (mvp->v_type != VDIR)
 692                 return (ENOTDIR);
 693
 694         /*
 695          * get arguments
 696          *
 697          * nfs_args is now versioned and is extensible, so
 698          * uap->datalen might be different from sizeof (args)
 699          * in a compatible situation.
 700          */
 701 more:
 702         if (!(uap->flags & MS_SYSSPACE)) {
 703                 if (args == NULL)
 704                         args = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);
 705                 else
 706                         nfs4_free_args(args);
 707                 error = nfs4_copyin(data, uap->datalen, args);
 708                 if (error) {
 709                         if (args) {
 710                                 kmem_free(args, sizeof (*args));
 711                         }
 712                         return (error);
 713                 }
 714         } else {
 715                 args = (struct nfs_args *)data;
 716         }
 717
 718         flags = args->flags;
 719
 720         /*
 721          * If the request changes the locking type, disallow the remount,
 722          * because it's questionable whether we can transfer the
 723          * locking state correctly.
 724          */
 725         if (uap->flags & MS_REMOUNT) {
 726                 if (!(uap->flags & MS_SYSSPACE)) {
 727                         nfs4_free_args(args);
 728                         kmem_free(args, sizeof (*args));
 729                 }
 730                 if ((mi = VFTOMI4(vfsp)) != NULL) {
 731                         uint_t new_mi_llock;
 732                         uint_t old_mi_llock;
 733                         new_mi_llock = (flags & NFSMNT_LLOCK) ? 1 : 0;
 734                         old_mi_llock = (mi->mi_flags & MI4_LLOCK) ? 1 : 0;
 735                         if (old_mi_llock != new_mi_llock)
 736                                 return (EBUSY);
 737                 }
 738                 return (0);
 739         }
 740
 741         /*
 742          * For ephemeral mount trigger stub vnodes, we have two problems
 743          * to solve: racing threads will likely fail the v_count check, and
 744          * we want only one to proceed with the mount.
 745          *
 746          * For stubs, if the mount has already occurred (via a racing thread),
 747          * just return success. If not, skip the v_count check and proceed.
 748          * Note that we are already serialised at this point.
 749          */
 750         mutex_enter(&mvp->v_lock);
 751         if (vn_matchops(mvp, &nfs4_trigger_vnodeops)) {
 752                 /* mntpt is a v4 stub vnode */
 753                 ASSERT(RP_ISSTUB(VTOR4(mvp)));
 754                 ASSERT(!(uap->flags & MS_OVERLAY));
 755                 ASSERT(!(mvp->v_flag & VROOT));
 756                 if (vn_mountedvfs(mvp) != NULL) {
 757                         /* ephemeral mount has already occurred */
 758                         ASSERT(uap->flags & MS_SYSSPACE);
 759                         mutex_exit(&mvp->v_lock);
 760                         return (0);
 761                 }
 762         } else {
 763                 /* mntpt is a non-v4 or v4 non-stub vnode */
 764                 if (!(uap->flags & MS_OVERLAY) &&
 765                     (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
 766                         mutex_exit(&mvp->v_lock);
 767                         if (!(uap->flags & MS_SYSSPACE)) {
 768                                 nfs4_free_args(args);
 769                                 kmem_free(args, sizeof (*args));
 770                         }
 771                         return (EBUSY);
 772                 }
 773         }
 774         mutex_exit(&mvp->v_lock);
 775
 776         /* make sure things are zeroed for errout: */
 777         rtvp = NULL;
 778         mi = NULL;
 779         secdata = NULL;
 780
 781         /*
 782          * A valid knetconfig structure is required.
 783          */
 784         if (!(flags & NFSMNT_KNCONF) ||
 785             args->knconf == NULL || args->knconf->knc_protofmly == NULL ||
 786             args->knconf->knc_proto == NULL ||
 787             (strcmp(args->knconf->knc_proto, NC_UDP) == 0)) {
 788                 if (!(uap->flags & MS_SYSSPACE)) {
 789                         nfs4_free_args(args);
 790                         kmem_free(args, sizeof (*args));
 791                 }
 792                 return (EINVAL);
 793         }
 794
 795         if ((strlen(args->knconf->knc_protofmly) >= KNC_STRSIZE) ||
 796             (strlen(args->knconf->knc_proto) >= KNC_STRSIZE)) {
 797                 if (!(uap->flags & MS_SYSSPACE)) {
 798                         nfs4_free_args(args);
 799                         kmem_free(args, sizeof (*args));
 800                 }
 801                 return (EINVAL);
 802         }
 803
 804         /*
 805          * Allocate a servinfo4 struct.
 806          */
 807         svp = kmem_zalloc(sizeof (*svp), KM_SLEEP);
 808         nfs_rw_init(&svp->sv_lock, NULL, RW_DEFAULT, NULL);
 809         if (svp_tail) {
 810                 svp_2ndlast = svp_tail;
 811                 svp_tail->sv_next = svp;
 812         } else {
 813                 svp_head = svp;
 814                 svp_2ndlast = svp;
 815         }
 816
 817         svp_tail = svp;
 818         svp->sv_knconf = args->knconf;
 819         args->knconf = NULL;
 820
 821         /*
 822          * Get server address
 823          */
 824         if (args->addr == NULL || args->addr->buf == NULL) {
 825                 error = EINVAL;
 826                 goto errout;
 827         }
 828
 829         svp->sv_addr.maxlen = args->addr->maxlen;
 830         svp->sv_addr.len = args->addr->len;
 831         svp->sv_addr.buf = args->addr->buf;
 832         args->addr->buf = NULL;
 833
 834         /*
 835          * Get the root fhandle
 836          */
 837         if (args->fh == NULL || (strlen(args->fh) >= MAXPATHLEN)) {
 838                 error = EINVAL;
 839                 goto errout;
 840         }
 841
 842         svp->sv_path = args->fh;
 843         svp->sv_pathlen = strlen(args->fh) + 1;
 844         args->fh = NULL;
 845
 846         /*
 847          * Get server's hostname
 848          */
 849         if (flags & NFSMNT_HOSTNAME) {
 850                 if (args->hostname == NULL || (strlen(args->hostname) >
 851                     MAXNETNAMELEN)) {
 852                         error = EINVAL;
 853                         goto errout;
 854                 }
 855                 svp->sv_hostnamelen = strlen(args->hostname) + 1;
 856                 svp->sv_hostname = args->hostname;
 857                 args->hostname = NULL;
 858         } else {
 859                 char *p = "unknown-host";
 860                 svp->sv_hostnamelen = strlen(p) + 1;
 861                 svp->sv_hostname = kmem_zalloc(svp->sv_hostnamelen, KM_SLEEP);
 862                 (void) strcpy(svp->sv_hostname, p);
 863         }
 864
 865         /*
 866          * RDMA MOUNT SUPPORT FOR NFS v4.
 867          * Establish, is it possible to use RDMA, if so overload the
 868          * knconf with rdma specific knconf and free the orignal knconf.
 869          */
 870         if ((flags & NFSMNT_TRYRDMA) || (flags & NFSMNT_DORDMA)) {
 871                 /*
 872                  * Determine the addr type for RDMA, IPv4 or v6.
 873                  */
 874                 if (strcmp(svp->sv_knconf->knc_protofmly, NC_INET) == 0)
 875                         addr_type = AF_INET;
 876                 else if (strcmp(svp->sv_knconf->knc_protofmly, NC_INET6) == 0)
 877                         addr_type = AF_INET6;
 878
 879                 if (rdma_reachable(addr_type, &svp->sv_addr,
 880                     &rdma_knconf) == 0) {
 881                         /*
 882                          * If successful, hijack the orignal knconf and
 883                          * replace with the new one, depending on the flags.
 884                          */
 885                         svp->sv_origknconf = svp->sv_knconf;
 886                         svp->sv_knconf = rdma_knconf;
 887                 } else {
 888                         if (flags & NFSMNT_TRYRDMA) {
 889 #ifdef  DEBUG
 890                                 if (rdma_debug)
 891                                         zcmn_err(getzoneid(), CE_WARN,
 892                                             "no RDMA onboard, revert\n");
 893 #endif
 894                         }
 895
 896                         if (flags & NFSMNT_DORDMA) {
 897                                 /*
 898                                  * If proto=rdma is specified and no RDMA
 899                                  * path to this server is avialable then
 900                                  * ditch this server.
 901                                  * This is not included in the mountable
 902                                  * server list or the replica list.
 903                                  * Check if more servers are specified;
 904                                  * Failover case, otherwise bail out of mount.
 905                                  */
 906                                 if (args->nfs_args_ext == NFS_ARGS_EXTB &&
 907                                     args->nfs_ext_u.nfs_extB.next != NULL) {
 908                                         data = (char *)
 909                                             args->nfs_ext_u.nfs_extB.next;
 910                                         if (uap->flags & MS_RDONLY &&
 911                                             !(flags & NFSMNT_SOFT)) {
 912                                                 if (svp_head->sv_next == NULL) {
 913                                                         svp_tail = NULL;
 914                                                         svp_2ndlast = NULL;
 915                                                         sv4_free(svp_head);
 916                                                         goto more;
 917                                                 } else {
 918                                                         svp_tail = svp_2ndlast;
 919                                                         svp_2ndlast->sv_next =
 920                                                             NULL;
 921                                                         sv4_free(svp);
 922                                                         goto more;
 923                                                 }
 924                                         }
 925                                 } else {
 926                                         /*
 927                                          * This is the last server specified
 928                                          * in the nfs_args list passed down
 929                                          * and its not rdma capable.
 930                                          */
 931                                         if (svp_head->sv_next == NULL) {
 932                                                 /*
 933                                                  * Is this the only one
 934                                                  */
 935                                                 error = EINVAL;
 936 #ifdef  DEBUG
 937                                                 if (rdma_debug)
 938                                                         zcmn_err(getzoneid(),
 939                                                             CE_WARN,
 940                                                             "No RDMA srv");
 941 #endif
 942                                                 goto errout;
 943                                         } else {
 944                                                 /*
 945                                                  * There is list, since some
 946                                                  * servers specified before
 947                                                  * this passed all requirements
 948                                                  */
 949                                                 svp_tail = svp_2ndlast;
 950                                                 svp_2ndlast->sv_next = NULL;
 951                                                 sv4_free(svp);
 952                                                 goto proceed;
 953                                         }
 954                                 }
 955                         }
 956                 }
 957         }
 958
 959         /*
 960          * If there are syncaddr and netname data, load them in. This is
 961          * to support data needed for NFSV4 when AUTH_DH is the negotiated
 962          * flavor via SECINFO. (instead of using MOUNT protocol in V3).
 963          */
 964         if (args->flags & NFSMNT_SECURE) {
 965                 svp->sv_dhsec = create_authdh_data(args->netname,
 966                     strlen(args->netname),
 967                     args->syncaddr, svp->sv_knconf);
 968         }
 969
 970         /*
 971          * Get the extention data which has the security data structure.
 972          * This includes data for AUTH_SYS as well.
 973          */
 974         if (flags & NFSMNT_NEWARGS) {
 975                 switch (args->nfs_args_ext) {
 976                 case NFS_ARGS_EXTA:
 977                 case NFS_ARGS_EXTB:
 978                         /*
 979                          * Indicating the application is using the new
 980                          * sec_data structure to pass in the security
 981                          * data.
 982                          */
 983                         secdata = args->nfs_ext_u.nfs_extA.secdata;
 984                         if (secdata == NULL) {
 985                                 error = EINVAL;
 986                         } else if (uap->flags & MS_SYSSPACE) {
 987                                 /*
 988                                  * Need to validate the flavor here if
 989                                  * sysspace, userspace was already
 990                                  * validate from the nfs_copyin function.
 991                                  */
 992                                 switch (secdata->rpcflavor) {
 993                                 case AUTH_NONE:
 994                                 case AUTH_UNIX:
 995                                 case AUTH_LOOPBACK:
 996                                 case AUTH_DES:
 997                                 case RPCSEC_GSS:
 998                                         break;
 999                                 default:
1000                                         error = EINVAL;
1001                                         goto errout;
1002                                 }
1003                         }
1004                         args->nfs_ext_u.nfs_extA.secdata = NULL;
1005                         break;
1006
1007                 default:
1008                         error = EINVAL;
1009                         break;
1010                 }
1011
1012         } else if (flags & NFSMNT_SECURE) {
1013                 /*
1014                  * NFSMNT_SECURE is deprecated but we keep it
1015                  * to support the rogue user-generated application
1016                  * that may use this undocumented interface to do
1017                  * AUTH_DH security, e.g. our own rexd.
1018                  *
1019                  * Also note that NFSMNT_SECURE is used for passing
1020                  * AUTH_DH info to be used in negotiation.
1021                  */
1022                 secdata = create_authdh_data(args->netname,
1023                     strlen(args->netname), args->syncaddr, svp->sv_knconf);
1024
1025         } else {
1026                 secdata = kmem_alloc(sizeof (*secdata), KM_SLEEP);
1027                 secdata->secmod = secdata->rpcflavor = AUTH_SYS;
1028                 secdata->data = NULL;
1029         }
1030
1031         svp->sv_secdata = secdata;
1032
1033         /*
1034          * User does not explictly specify a flavor, and a user
1035          * defined default flavor is passed down.
1036          */
1037         if (flags & NFSMNT_SECDEFAULT) {
1038                 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
1039                 svp->sv_flags |= SV4_TRYSECDEFAULT;
1040                 nfs_rw_exit(&svp->sv_lock);
1041         }
1042
1043         /*
1044          * Failover support:
1045          *
1046          * We may have a linked list of nfs_args structures,
1047          * which means the user is looking for failover.  If
1048          * the mount is either not "read-only" or "soft",
1049          * we want to bail out with EINVAL.
1050          */
1051         if (args->nfs_args_ext == NFS_ARGS_EXTB &&
1052             args->nfs_ext_u.nfs_extB.next != NULL) {
1053                 if (uap->flags & MS_RDONLY && !(flags & NFSMNT_SOFT)) {
1054                         data = (char *)args->nfs_ext_u.nfs_extB.next;
1055                         goto more;
1056                 }
1057                 error = EINVAL;
1058                 goto errout;
1059         }
1060
1061         /*
1062          * Determine the zone we're being mounted into.
1063          */
1064         zone_hold(mntzone = zone);              /* start with this assumption */
1065         if (getzoneid() == GLOBAL_ZONEID) {
1066                 zone_rele(mntzone);
1067                 mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
1068                 ASSERT(mntzone != NULL);
1069                 if (mntzone != zone) {
1070                         error = EBUSY;
1071                         goto errout;
1072                 }
1073         }
1074
1075         /*
1076          * Stop the mount from going any further if the zone is going away.
1077          */
1078         if (zone_status_get(mntzone) >= ZONE_IS_SHUTTING_DOWN) {
1079                 error = EBUSY;
1080                 goto errout;
1081         }
1082
1083         /*
1084          * Get root vnode.
1085          */
1086 proceed:
1087         error = nfs4rootvp(&rtvp, vfsp, svp_head, flags, cr, mntzone);
1088         if (error) {
1089                 /* if nfs4rootvp failed, it will free svp_head */
1090                 svp_head = NULL;
1091                 goto errout;
1092         }
1093
1094         mi = VTOMI4(rtvp);
1095
1096         /*
1097          * Send client id to the server, if necessary
1098          */
1099         nfs4_error_zinit(&n4e);
1100         nfs4setclientid(mi, cr, FALSE, &n4e);
1101
1102         error = n4e.error;
1103
1104         if (error)
1105                 goto errout;
1106
1107         /*
1108          * Set option fields in the mount info record
1109          */
1110
1111         if (svp_head->sv_next) {
1112                 mutex_enter(&mi->mi_lock);
1113                 mi->mi_flags |= MI4_LLOCK;
1114                 mutex_exit(&mi->mi_lock);
1115         }
1116         error = nfs4_setopts(rtvp, DATAMODEL_NATIVE, args);
1117         if (error)
1118                 goto errout;
1119
1120         /*
1121          * Time to tie in the mirror mount info at last!
1122          */
1123         if (flags & NFSMNT_EPHEMERAL)
1124                 error = nfs4_record_ephemeral_mount(mi, mvp);
1125
1126 errout:
1127         if (error) {
1128                 if (rtvp != NULL) {
1129                         rp = VTOR4(rtvp);
1130                         if (rp->r_flags & R4HASHED)
1131                                 rp4_rmhash(rp);
1132                 }
1133                 if (mi != NULL) {
1134                         nfs4_async_stop(vfsp);
1135                         nfs4_async_manager_stop(vfsp);
1136                         nfs4_remove_mi_from_server(mi, NULL);
1137                         if (rtvp != NULL)
1138                                 VN_RELE(rtvp);
1139                         if (mntzone != NULL)
1140                                 zone_rele(mntzone);
1141                         /* need to remove it from the zone */
1142                         removed = nfs4_mi_zonelist_remove(mi);
1143                         if (removed)
1144                                 zone_rele_ref(&mi->mi_zone_ref,
1145                                     ZONE_REF_NFSV4);
1146                         MI4_RELE(mi);
1147                         if (!(uap->flags & MS_SYSSPACE) && args) {
1148                                 nfs4_free_args(args);
1149                                 kmem_free(args, sizeof (*args));
1150                         }
1151                         return (error);
1152                 }
1153                 if (svp_head)
1154                         sv4_free(svp_head);
1155         }
1156
1157         if (!(uap->flags & MS_SYSSPACE) && args) {
1158                 nfs4_free_args(args);
1159                 kmem_free(args, sizeof (*args));
1160         }
1161         if (rtvp != NULL)
1162                 VN_RELE(rtvp);
1163
1164         if (mntzone != NULL)
1165                 zone_rele(mntzone);
1166
1167         return (error);
1168 }
1169
1170 #ifdef  DEBUG
1171 #define VERS_MSG        "NFS4 server "
1172 #else
1173 #define VERS_MSG        "NFS server "
1174 #endif
1175
1176 #define READ_MSG        \
1177         VERS_MSG "%s returned 0 for read transfer size"
1178 #define WRITE_MSG       \
1179         VERS_MSG "%s returned 0 for write transfer size"
1180 #define SIZE_MSG        \
1181         VERS_MSG "%s returned 0 for maximum file size"
1182
1183 /*
1184  * Get the symbolic link text from the server for a given filehandle
1185  * of that symlink.
1186  *
1187  *      (get symlink text) PUTFH READLINK
1188  */
1189 static int
1190 getlinktext_otw(mntinfo4_t *mi, nfs_fh4 *fh, char **linktextp, cred_t *cr,
1191     int flags)
1192 {
1193         COMPOUND4args_clnt args;
1194         COMPOUND4res_clnt res;
1195         int doqueue;
1196         nfs_argop4 argop[2];
1197         nfs_resop4 *resop;
1198         READLINK4res *lr_res;
1199         uint_t len;
1200         bool_t needrecov = FALSE;
1201         nfs4_recov_state_t recov_state;
1202         nfs4_sharedfh_t *sfh;
1203         nfs4_error_t e;
1204         int num_retry = nfs4_max_mount_retry;
1205         int recovery = !(flags & NFS4_GETFH_NEEDSOP);
1206
1207         sfh = sfh4_get(fh, mi);
1208         recov_state.rs_flags = 0;
1209         recov_state.rs_num_retry_despite_err = 0;
1210
1211 recov_retry:
1212         nfs4_error_zinit(&e);
1213
1214         args.array_len = 2;
1215         args.array = argop;
1216         args.ctag = TAG_GET_SYMLINK;
1217
1218         if (! recovery) {
1219                 e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
1220                 if (e.error) {
1221                         sfh4_rele(&sfh);
1222                         return (e.error);
1223                 }
1224         }
1225
1226         /* 0. putfh symlink fh */
1227         argop[0].argop = OP_CPUTFH;
1228         argop[0].nfs_argop4_u.opcputfh.sfh = sfh;
1229
1230         /* 1. readlink */
1231         argop[1].argop = OP_READLINK;
1232
1233         doqueue = 1;
1234
1235         rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1236
1237         needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
1238
1239         if (needrecov && !recovery && num_retry-- > 0) {
1240
1241                 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1242                     "getlinktext_otw: initiating recovery\n"));
1243
1244                 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
1245                     OP_READLINK, NULL, NULL, NULL) == FALSE) {
1246                         nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
1247                         if (!e.error)
1248                                 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1249                         goto recov_retry;
1250                 }
1251         }
1252
1253         /*
1254          * If non-NFS4 pcol error and/or we weren't able to recover.
1255          */
1256         if (e.error != 0) {
1257                 if (! recovery)
1258                         nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
1259                 sfh4_rele(&sfh);
1260                 return (e.error);
1261         }
1262
1263         if (res.status) {
1264                 e.error = geterrno4(res.status);
1265                 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1266                 if (! recovery)
1267                         nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
1268                 sfh4_rele(&sfh);
1269                 return (e.error);
1270         }
1271
1272         /* res.status == NFS4_OK */
1273         ASSERT(res.status == NFS4_OK);
1274
1275         resop = &res.array[1];  /* readlink res */
1276         lr_res = &resop->nfs_resop4_u.opreadlink;
1277
1278         /* treat symlink name as data */
1279         *linktextp = utf8_to_str((utf8string *)&lr_res->link, &len, NULL);
1280
1281         if (! recovery)
1282                 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
1283         sfh4_rele(&sfh);
1284         xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1285         return (0);
1286 }
1287
1288 /*
1289  * Skip over consecutive slashes and "/./" in a pathname.
1290  */
1291 void
1292 pathname_skipslashdot(struct pathname *pnp)
1293 {
1294         char *c1, *c2;
1295
1296         while (pnp->pn_pathlen > 0 && *pnp->pn_path == '/') {
1297
1298                 c1 = pnp->pn_path + 1;
1299                 c2 = pnp->pn_path + 2;
1300
1301                 if (*c1 == '.' && (*c2 == '/' || *c2 == '\0')) {
1302                         pnp->pn_path = pnp->pn_path + 2; /* skip "/." */
1303                         pnp->pn_pathlen = pnp->pn_pathlen - 2;
1304                 } else {
1305                         pnp->pn_path++;
1306                         pnp->pn_pathlen--;
1307                 }
1308         }
1309 }
1310
1311 /*
1312  * Resolve a symbolic link path. The symlink is in the nth component of
1313  * svp->sv_path and has an nfs4 file handle "fh".
1314  * Upon return, the sv_path will point to the new path that has the nth
1315  * component resolved to its symlink text.
1316  */
1317 int
1318 resolve_sympath(mntinfo4_t *mi, servinfo4_t *svp, int nth, nfs_fh4 *fh,
1319     cred_t *cr, int flags)
1320 {
1321         char *oldpath;
1322         char *symlink, *newpath;
1323         struct pathname oldpn, newpn;
1324         char component[MAXNAMELEN];
1325         int i, addlen, error = 0;
1326         int oldpathlen;
1327
1328         /* Get the symbolic link text over the wire. */
1329         error = getlinktext_otw(mi, fh, &symlink, cr, flags);
1330
1331         if (error || symlink == NULL || strlen(symlink) == 0)
1332                 return (error);
1333
1334         /*
1335          * Compose the new pathname.
1336          * Note:
1337          *    - only the nth component is resolved for the pathname.
1338          *    - pathname.pn_pathlen does not count the ending null byte.
1339          */
1340         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1341         oldpath = svp->sv_path;
1342         oldpathlen = svp->sv_pathlen;
1343         if (error = pn_get(oldpath, UIO_SYSSPACE, &oldpn)) {
1344                 nfs_rw_exit(&svp->sv_lock);
1345                 kmem_free(symlink, strlen(symlink) + 1);
1346                 return (error);
1347         }
1348         nfs_rw_exit(&svp->sv_lock);
1349         pn_alloc(&newpn);
1350
1351         /*
1352          * Skip over previous components from the oldpath so that the
1353          * oldpn.pn_path will point to the symlink component. Skip
1354          * leading slashes and "/./" (no OP_LOOKUP on ".") so that
1355          * pn_getcompnent can get the component.
1356          */
1357         for (i = 1; i < nth; i++) {
1358                 pathname_skipslashdot(&oldpn);
1359                 error = pn_getcomponent(&oldpn, component);
1360                 if (error)
1361                         goto out;
1362         }
1363
1364         /*
1365          * Copy the old path upto the component right before the symlink
1366          * if the symlink is not an absolute path.
1367          */
1368         if (symlink[0] != '/') {
1369                 addlen = oldpn.pn_path - oldpn.pn_buf;
1370                 bcopy(oldpn.pn_buf, newpn.pn_path, addlen);
1371                 newpn.pn_pathlen += addlen;
1372                 newpn.pn_path += addlen;
1373                 newpn.pn_buf[newpn.pn_pathlen] = '/';
1374                 newpn.pn_pathlen++;
1375                 newpn.pn_path++;
1376         }
1377
1378         /* copy the resolved symbolic link text */
1379         addlen = strlen(symlink);
1380         if (newpn.pn_pathlen + addlen >= newpn.pn_bufsize) {
1381                 error = ENAMETOOLONG;
1382                 goto out;
1383         }
1384         bcopy(symlink, newpn.pn_path, addlen);
1385         newpn.pn_pathlen += addlen;
1386         newpn.pn_path += addlen;
1387
1388         /*
1389          * Check if there is any remaining path after the symlink component.
1390          * First, skip the symlink component.
1391          */
1392         pathname_skipslashdot(&oldpn);
1393         if (error = pn_getcomponent(&oldpn, component))
1394                 goto out;
1395
1396         addlen = pn_pathleft(&oldpn); /* includes counting the slash */
1397
1398         /*
1399          * Copy the remaining path to the new pathname if there is any.
1400          */
1401         if (addlen > 0) {
1402                 if (newpn.pn_pathlen + addlen >= newpn.pn_bufsize) {
1403                         error = ENAMETOOLONG;
1404                         goto out;
1405                 }
1406                 bcopy(oldpn.pn_path, newpn.pn_path, addlen);
1407                 newpn.pn_pathlen += addlen;
1408         }
1409         newpn.pn_buf[newpn.pn_pathlen] = '\0';
1410
1411         /* get the newpath and store it in the servinfo4_t */
1412         newpath = kmem_alloc(newpn.pn_pathlen + 1, KM_SLEEP);
1413         bcopy(newpn.pn_buf, newpath, newpn.pn_pathlen);
1414         newpath[newpn.pn_pathlen] = '\0';
1415
1416         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
1417         svp->sv_path = newpath;
1418         svp->sv_pathlen = strlen(newpath) + 1;
1419         nfs_rw_exit(&svp->sv_lock);
1420
1421         kmem_free(oldpath, oldpathlen);
1422 out:
1423         kmem_free(symlink, strlen(symlink) + 1);
1424         pn_free(&newpn);
1425         pn_free(&oldpn);
1426
1427         return (error);
1428 }
1429
1430 /*
1431  * This routine updates servinfo4 structure with the new referred server
1432  * info.
1433  * nfsfsloc has the location related information
1434  * fsp has the hostname and pathname info.
1435  * new path = pathname from referral + part of orig pathname(based on nth).
1436  */
1437 static void
1438 update_servinfo4(servinfo4_t *svp, fs_location4 *fsp,
1439     struct nfs_fsl_info *nfsfsloc, char *orig_path, int nth)
1440 {
1441         struct knetconfig *knconf, *svknconf;
1442         struct netbuf *saddr;
1443         sec_data_t      *secdata;
1444         utf8string *host;
1445         int i = 0, num_slashes = 0;
1446         char *p, *spath, *op, *new_path;
1447
1448         /* Update knconf */
1449         knconf = svp->sv_knconf;
1450         free_knconf_contents(knconf);
1451         bzero(knconf, sizeof (struct knetconfig));
1452         svknconf = nfsfsloc->knconf;
1453         knconf->knc_semantics = svknconf->knc_semantics;
1454         knconf->knc_protofmly = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1455         knconf->knc_proto = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1456         knconf->knc_rdev = svknconf->knc_rdev;
1457         bcopy(svknconf->knc_protofmly, knconf->knc_protofmly, KNC_STRSIZE);
1458         bcopy(svknconf->knc_proto, knconf->knc_proto, KNC_STRSIZE);
1459
1460         /* Update server address */
1461         saddr = &svp->sv_addr;
1462         if (saddr->buf != NULL)
1463                 kmem_free(saddr->buf, saddr->maxlen);
1464         saddr->buf  = kmem_alloc(nfsfsloc->addr->maxlen, KM_SLEEP);
1465         saddr->len = nfsfsloc->addr->len;
1466         saddr->maxlen = nfsfsloc->addr->maxlen;
1467         bcopy(nfsfsloc->addr->buf, saddr->buf, nfsfsloc->addr->len);
1468
1469         /* Update server name */
1470         host = fsp->server_val;
1471         kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
1472         svp->sv_hostname = kmem_zalloc(host->utf8string_len + 1, KM_SLEEP);
1473         bcopy(host->utf8string_val, svp->sv_hostname, host->utf8string_len);
1474         svp->sv_hostname[host->utf8string_len] = '\0';
1475         svp->sv_hostnamelen = host->utf8string_len + 1;
1476
1477         /*
1478          * Update server path.
1479          * We need to setup proper path here.
1480          * For ex., If we got a path name serv1:/rp/aaa/bbb
1481          * where aaa is a referral and points to serv2:/rpool/aa
1482          * we need to set the path to serv2:/rpool/aa/bbb
1483          * The first part of this below code generates /rpool/aa
1484          * and the second part appends /bbb to the server path.
1485          */
1486         spath = p = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1487         *p++ = '/';
1488         for (i = 0; i < fsp->rootpath.pathname4_len; i++) {
1489                 component4 *comp;
1490
1491                 comp = &fsp->rootpath.pathname4_val[i];
1492                 /* If no space, null the string and bail */
1493                 if ((p - spath) + comp->utf8string_len + 1 > MAXPATHLEN) {
1494                         p = spath + MAXPATHLEN - 1;
1495                         spath[0] = '\0';
1496                         break;
1497                 }
1498                 bcopy(comp->utf8string_val, p, comp->utf8string_len);
1499                 p += comp->utf8string_len;
1500                 *p++ = '/';
1501         }
1502         if (fsp->rootpath.pathname4_len != 0)
1503                 *(p - 1) = '\0';
1504         else
1505                 *p = '\0';
1506         p = spath;
1507
1508         new_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1509         (void) strlcpy(new_path, p, MAXPATHLEN);
1510         kmem_free(p, MAXPATHLEN);
1511         i = strlen(new_path);
1512
1513         for (op = orig_path; *op; op++) {
1514                 if (*op == '/')
1515                         num_slashes++;
1516                 if (num_slashes == nth + 2) {
1517                         while (*op != '\0') {
1518                                 new_path[i] = *op;
1519                                 i++;
1520                                 op++;
1521                         }
1522                         break;
1523                 }
1524         }
1525         new_path[i] = '\0';
1526
1527         kmem_free(svp->sv_path, svp->sv_pathlen);
1528         svp->sv_pathlen = strlen(new_path) + 1;
1529         svp->sv_path = kmem_alloc(svp->sv_pathlen, KM_SLEEP);
1530         bcopy(new_path, svp->sv_path, svp->sv_pathlen);
1531         kmem_free(new_path, MAXPATHLEN);
1532
1533         /*
1534          * All the security data is specific to old server.
1535          * Clean it up except secdata which deals with mount options.
1536          * We need to inherit that data. Copy secdata into our new servinfo4.
1537          */
1538         if (svp->sv_dhsec) {
1539                 sec_clnt_freeinfo(svp->sv_dhsec);
1540                 svp->sv_dhsec = NULL;
1541         }
1542         if (svp->sv_save_secinfo &&
1543             svp->sv_save_secinfo != svp->sv_secinfo) {
1544                 secinfo_free(svp->sv_save_secinfo);
1545                 svp->sv_save_secinfo = NULL;
1546         }
1547         if (svp->sv_secinfo) {
1548                 secinfo_free(svp->sv_secinfo);
1549                 svp->sv_secinfo = NULL;
1550         }
1551         svp->sv_currsec = NULL;
1552
1553         secdata = kmem_alloc(sizeof (*secdata), KM_SLEEP);
1554         *secdata = *svp->sv_secdata;
1555         secdata->data = NULL;
1556         if (svp->sv_secdata) {
1557                 sec_clnt_freeinfo(svp->sv_secdata);
1558                 svp->sv_secdata = NULL;
1559         }
1560         svp->sv_secdata = secdata;
1561 }
1562
1563 /*
1564  * Resolve a referral. The referral is in the n+1th component of
1565  * svp->sv_path and has a parent nfs4 file handle "fh".
1566  * Upon return, the sv_path will point to the new path that has referral
1567  * component resolved to its referred path and part of original path.
1568  * Hostname and other address information is also updated.
1569  */
1570 int
1571 resolve_referral(mntinfo4_t *mi, servinfo4_t *svp, cred_t *cr, int nth,
1572     nfs_fh4 *fh)
1573 {
1574         nfs4_sharedfh_t *sfh;
1575         struct nfs_fsl_info nfsfsloc;
1576         nfs4_ga_res_t garp;
1577         COMPOUND4res_clnt callres;
1578         fs_location4    *fsp;
1579         char *nm, *orig_path;
1580         int orig_pathlen = 0, ret = -1, index;
1581
1582         if (svp->sv_pathlen <= 0)
1583                 return (ret);
1584
1585         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
1586         orig_pathlen = svp->sv_pathlen;
1587         orig_path = kmem_alloc(orig_pathlen, KM_SLEEP);
1588         bcopy(svp->sv_path, orig_path, orig_pathlen);
1589         nm = extract_referral_point(svp->sv_path, nth);
1590         setup_newsvpath(svp, nth);
1591         nfs_rw_exit(&svp->sv_lock);
1592
1593         sfh = sfh4_get(fh, mi);
1594         index = nfs4_process_referral(mi, sfh, nm, cr,
1595             &garp, &callres, &nfsfsloc);
1596         sfh4_rele(&sfh);
1597         kmem_free(nm, MAXPATHLEN);
1598         if (index < 0) {
1599                 kmem_free(orig_path, orig_pathlen);
1600                 return (index);
1601         }
1602
1603         fsp =  &garp.n4g_ext_res->n4g_fslocations.locations_val[index];
1604         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
1605         update_servinfo4(svp, fsp, &nfsfsloc, orig_path, nth);
1606         nfs_rw_exit(&svp->sv_lock);
1607
1608         mutex_enter(&mi->mi_lock);
1609         mi->mi_vfs_referral_loop_cnt++;
1610         mutex_exit(&mi->mi_lock);
1611
1612         ret = 0;
1613 bad:
1614         /* Free up XDR memory allocated in nfs4_process_referral() */
1615         xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1616         xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1617         kmem_free(orig_path, orig_pathlen);
1618
1619         return (ret);
1620 }
1621
1622 /*
1623  * Get the root filehandle for the given filesystem and server, and update
1624  * svp.
1625  *
1626  * If NFS4_GETFH_NEEDSOP is set, then use nfs4_start_fop and nfs4_end_fop
1627  * to coordinate with recovery.  Otherwise, the caller is assumed to be
1628  * the recovery thread or have already done a start_fop.
1629  *
1630  * Errors are returned by the nfs4_error_t parameter.
1631  */
1632 static void
1633 nfs4getfh_otw(struct mntinfo4 *mi, servinfo4_t *svp, vtype_t *vtp,
1634     int flags, cred_t *cr, nfs4_error_t *ep)
1635 {
1636         COMPOUND4args_clnt args;
1637         COMPOUND4res_clnt res;
1638         int doqueue = 1;
1639         nfs_argop4 *argop;
1640         nfs_resop4 *resop;
1641         nfs4_ga_res_t *garp;
1642         int num_argops;
1643         lookup4_param_t lookuparg;
1644         nfs_fh4 *tmpfhp;
1645         nfs_fh4 *resfhp;
1646         bool_t needrecov = FALSE;
1647         nfs4_recov_state_t recov_state;
1648         int llndx;
1649         int nthcomp;
1650         int recovery = !(flags & NFS4_GETFH_NEEDSOP);
1651
1652         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1653         ASSERT(svp->sv_path != NULL);
1654         if (svp->sv_path[0] == '\0') {
1655                 nfs_rw_exit(&svp->sv_lock);
1656                 nfs4_error_init(ep, EINVAL);
1657                 return;
1658         }
1659         nfs_rw_exit(&svp->sv_lock);
1660
1661         recov_state.rs_flags = 0;
1662         recov_state.rs_num_retry_despite_err = 0;
1663
1664 recov_retry:
1665         if (mi->mi_vfs_referral_loop_cnt >= NFS4_REFERRAL_LOOP_MAX) {
1666                 DTRACE_PROBE3(nfs4clnt__debug__referral__loop, mntinfo4 *,
1667                     mi, servinfo4_t *, svp, char *, "nfs4getfh_otw");
1668                 nfs4_error_init(ep, EINVAL);
1669                 return;
1670         }
1671         nfs4_error_zinit(ep);
1672
1673         if (!recovery) {
1674                 ep->error = nfs4_start_fop(mi, NULL, NULL, OH_MOUNT,
1675                     &recov_state, NULL);
1676
1677                 /*
1678                  * If recovery has been started and this request as
1679                  * initiated by a mount, then we must wait for recovery
1680                  * to finish before proceeding, otherwise, the error
1681                  * cleanup would remove data structures needed by the
1682                  * recovery thread.
1683                  */
1684                 if (ep->error) {
1685                         mutex_enter(&mi->mi_lock);
1686                         if (mi->mi_flags & MI4_MOUNTING) {
1687                                 mi->mi_flags |= MI4_RECOV_FAIL;
1688                                 mi->mi_error = EIO;
1689
1690                                 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1691                                     "nfs4getfh_otw: waiting 4 recovery\n"));
1692
1693                                 while (mi->mi_flags & MI4_RECOV_ACTIV)
1694                                         cv_wait(&mi->mi_failover_cv,
1695                                             &mi->mi_lock);
1696                         }
1697                         mutex_exit(&mi->mi_lock);
1698                         return;
1699                 }
1700
1701                 /*
1702                  * If the client does not specify a specific flavor to use
1703                  * and has not gotten a secinfo list from the server yet,
1704                  * retrieve the secinfo list from the server and use a
1705                  * flavor from the list to mount.
1706                  *
1707                  * If fail to get the secinfo list from the server, then
1708                  * try the default flavor.
1709                  */
1710                 if ((svp->sv_flags & SV4_TRYSECDEFAULT) &&
1711                     svp->sv_secinfo == NULL) {
1712                         (void) nfs4_secinfo_path(mi, cr, FALSE);
1713                 }
1714         }
1715
1716         if (recovery)
1717                 args.ctag = TAG_REMAP_MOUNT;
1718         else
1719                 args.ctag = TAG_MOUNT;
1720
1721         lookuparg.l4_getattrs = LKP4_ALL_ATTRIBUTES;
1722         lookuparg.argsp = &args;
1723         lookuparg.resp = &res;
1724         lookuparg.header_len = 2;       /* Putrootfh, getfh */
1725         lookuparg.trailer_len = 0;
1726         lookuparg.ga_bits = FATTR4_FSINFO_MASK;
1727         lookuparg.mi = mi;
1728
1729         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1730         ASSERT(svp->sv_path != NULL);
1731         llndx = nfs4lookup_setup(svp->sv_path, &lookuparg, 0);
1732         nfs_rw_exit(&svp->sv_lock);
1733
1734         argop = args.array;
1735         num_argops = args.array_len;
1736
1737         /* choose public or root filehandle */
1738         if (flags & NFS4_GETFH_PUBLIC)
1739                 argop[0].argop = OP_PUTPUBFH;
1740         else
1741                 argop[0].argop = OP_PUTROOTFH;
1742
1743         /* get fh */
1744         argop[1].argop = OP_GETFH;
1745
1746         NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1747             "nfs4getfh_otw: %s call, mi 0x%p",
1748             needrecov ? "recov" : "first", (void *)mi));
1749
1750         rfs4call(mi, &args, &res, cr, &doqueue, RFSCALL_SOFT, ep);
1751
1752         needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp);
1753
1754         if (needrecov) {
1755                 bool_t abort;
1756
1757                 if (recovery) {
1758                         nfs4args_lookup_free(argop, num_argops);
1759                         kmem_free(argop,
1760                             lookuparg.arglen * sizeof (nfs_argop4));
1761                         if (!ep->error)
1762                                 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1763                         return;
1764                 }
1765
1766                 NFS4_DEBUG(nfs4_client_recov_debug,
1767                     (CE_NOTE, "nfs4getfh_otw: initiating recovery\n"));
1768
1769                 abort = nfs4_start_recovery(ep, mi, NULL,
1770                     NULL, NULL, NULL, OP_GETFH, NULL, NULL, NULL);
1771                 if (!ep->error) {
1772                         ep->error = geterrno4(res.status);
1773                         xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1774                 }
1775                 nfs4args_lookup_free(argop, num_argops);
1776                 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1777                 nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state, needrecov);
1778                 /* have another go? */
1779                 if (abort == FALSE)
1780                         goto recov_retry;
1781                 return;
1782         }
1783
1784         /*
1785          * No recovery, but check if error is set.
1786          */
1787         if (ep->error)  {
1788                 nfs4args_lookup_free(argop, num_argops);
1789                 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1790                 if (!recovery)
1791                         nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
1792                             needrecov);
1793                 return;
1794         }
1795
1796 is_link_err:
1797
1798         /* for non-recovery errors */
1799         if (res.status && res.status != NFS4ERR_SYMLINK &&
1800             res.status != NFS4ERR_MOVED) {
1801                 if (!recovery) {
1802                         nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
1803                             needrecov);
1804                 }
1805                 nfs4args_lookup_free(argop, num_argops);
1806                 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1807                 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1808                 return;
1809         }
1810
1811         /*
1812          * If any intermediate component in the path is a symbolic link,
1813          * resolve the symlink, then try mount again using the new path.
1814          */
1815         if (res.status == NFS4ERR_SYMLINK || res.status == NFS4ERR_MOVED) {
1816                 int where;
1817
1818                 /*
1819                  * Need to call nfs4_end_op before resolve_sympath to avoid
1820                  * potential nfs4_start_op deadlock.
1821                  */
1822                 if (!recovery)
1823                         nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
1824                             needrecov);
1825
1826                 /*
1827                  * This must be from OP_LOOKUP failure. The (cfh) for this
1828                  * OP_LOOKUP is a symlink node. Found out where the
1829                  * OP_GETFH is for the (cfh) that is a symlink node.
1830                  *
1831                  * Example:
1832                  * (mount) PUTROOTFH, GETFH, LOOKUP comp1, GETFH, GETATTR,
1833                  * LOOKUP comp2, GETFH, GETATTR, LOOKUP comp3, GETFH, GETATTR
1834                  *
1835                  * LOOKUP comp3 fails with SYMLINK because comp2 is a symlink.
1836                  * In this case, where = 7, nthcomp = 2.
1837                  */
1838                 where = res.array_len - 2;
1839                 ASSERT(where > 0);
1840
1841                 if (res.status == NFS4ERR_SYMLINK) {
1842
1843                         resop = &res.array[where - 1];
1844                         ASSERT(resop->resop == OP_GETFH);
1845                         tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
1846                         nthcomp = res.array_len/3 - 1;
1847                         ep->error = resolve_sympath(mi, svp, nthcomp,
1848                             tmpfhp, cr, flags);
1849
1850                 } else if (res.status == NFS4ERR_MOVED) {
1851
1852                         resop = &res.array[where - 2];
1853                         ASSERT(resop->resop == OP_GETFH);
1854                         tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
1855                         nthcomp = res.array_len/3 - 1;
1856                         ep->error = resolve_referral(mi, svp, cr, nthcomp,
1857                             tmpfhp);
1858                 }
1859
1860                 nfs4args_lookup_free(argop, num_argops);
1861                 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1862                 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1863
1864                 if (ep->error)
1865                         return;
1866
1867                 goto recov_retry;
1868         }
1869
1870         /* getfh */
1871         resop = &res.array[res.array_len - 2];
1872         ASSERT(resop->resop == OP_GETFH);
1873         resfhp = &resop->nfs_resop4_u.opgetfh.object;
1874
1875         /* getattr fsinfo res */
1876         resop++;
1877         garp = &resop->nfs_resop4_u.opgetattr.ga_res;
1878
1879         *vtp = garp->n4g_va.va_type;
1880
1881         mi->mi_fh_expire_type = garp->n4g_ext_res->n4g_fet;
1882
1883         mutex_enter(&mi->mi_lock);
1884         if (garp->n4g_ext_res->n4g_pc4.pc4_link_support)
1885                 mi->mi_flags |= MI4_LINK;
1886         if (garp->n4g_ext_res->n4g_pc4.pc4_symlink_support)
1887                 mi->mi_flags |= MI4_SYMLINK;
1888         if (garp->n4g_ext_res->n4g_suppattrs & FATTR4_ACL_MASK)
1889                 mi->mi_flags |= MI4_ACL;
1890         mutex_exit(&mi->mi_lock);
1891
1892         if (garp->n4g_ext_res->n4g_maxread == 0)
1893                 mi->mi_tsize =
1894                     MIN(MAXBSIZE, mi->mi_tsize);
1895         else
1896                 mi->mi_tsize =
1897                     MIN(garp->n4g_ext_res->n4g_maxread,
1898                     mi->mi_tsize);
1899
1900         if (garp->n4g_ext_res->n4g_maxwrite == 0)
1901                 mi->mi_stsize =
1902                     MIN(MAXBSIZE, mi->mi_stsize);
1903         else
1904                 mi->mi_stsize =
1905                     MIN(garp->n4g_ext_res->n4g_maxwrite,
1906                     mi->mi_stsize);
1907
1908         if (garp->n4g_ext_res->n4g_maxfilesize != 0)
1909                 mi->mi_maxfilesize =
1910                     MIN(garp->n4g_ext_res->n4g_maxfilesize,
1911                     mi->mi_maxfilesize);
1912
1913         /*
1914          * If the final component is a a symbolic link, resolve the symlink,
1915          * then try mount again using the new path.
1916          *
1917          * Assume no symbolic link for root filesysm "/".
1918          */
1919         if (*vtp == VLNK) {
1920                 /*
1921                  * nthcomp is the total result length minus
1922                  * the 1st 2 OPs (PUTROOTFH, GETFH),
1923                  * then divided by 3 (LOOKUP,GETFH,GETATTR)
1924                  *
1925                  * e.g. PUTROOTFH GETFH LOOKUP 1st-comp GETFH GETATTR
1926                  *      LOOKUP 2nd-comp GETFH GETATTR
1927                  *
1928                  *      (8 - 2)/3 = 2
1929                  */
1930                 nthcomp = (res.array_len - 2)/3;
1931
1932                 /*
1933                  * Need to call nfs4_end_op before resolve_sympath to avoid
1934                  * potential nfs4_start_op deadlock. See RFE 4777612.
1935                  */
1936                 if (!recovery)
1937                         nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
1938                             needrecov);
1939
1940                 ep->error = resolve_sympath(mi, svp, nthcomp, resfhp, cr,
1941                     flags);
1942
1943                 nfs4args_lookup_free(argop, num_argops);
1944                 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1945                 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1946
1947                 if (ep->error)
1948                         return;
1949
1950                 goto recov_retry;
1951         }
1952
1953         /*
1954          * We need to figure out where in the compound the getfh
1955          * for the parent directory is. If the object to be mounted is
1956          * the root, then there is no lookup at all:
1957          * PUTROOTFH, GETFH.
1958          * If the object to be mounted is in the root, then the compound is:
1959          * PUTROOTFH, GETFH, LOOKUP, GETFH, GETATTR.
1960          * In either of these cases, the index of the GETFH is 1.
1961          * If it is not at the root, then it's something like:
1962          * PUTROOTFH, GETFH, LOOKUP, GETFH, GETATTR,
1963          * LOOKUP, GETFH, GETATTR
1964          * In this case, the index is llndx (last lookup index) - 2.
1965          */
1966         if (llndx == -1 || llndx == 2)
1967                 resop = &res.array[1];
1968         else {
1969                 ASSERT(llndx > 2);
1970                 resop = &res.array[llndx-2];
1971         }
1972
1973         ASSERT(resop->resop == OP_GETFH);
1974         tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
1975
1976         /* save the filehandles for the replica */
1977         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
1978         ASSERT(tmpfhp->nfs_fh4_len <= NFS4_FHSIZE);
1979         svp->sv_pfhandle.fh_len = tmpfhp->nfs_fh4_len;
1980         bcopy(tmpfhp->nfs_fh4_val, svp->sv_pfhandle.fh_buf,
1981             tmpfhp->nfs_fh4_len);
1982         ASSERT(resfhp->nfs_fh4_len <= NFS4_FHSIZE);
1983         svp->sv_fhandle.fh_len = resfhp->nfs_fh4_len;
1984         bcopy(resfhp->nfs_fh4_val, svp->sv_fhandle.fh_buf, resfhp->nfs_fh4_len);
1985
1986         /* initialize fsid and supp_attrs for server fs */
1987         svp->sv_fsid = garp->n4g_fsid;
1988         svp->sv_supp_attrs =
1989             garp->n4g_ext_res->n4g_suppattrs | FATTR4_MANDATTR_MASK;
1990
1991         nfs_rw_exit(&svp->sv_lock);
1992         nfs4args_lookup_free(argop, num_argops);
1993         kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1994         xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1995         if (!recovery)
1996                 nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state, needrecov);
1997 }
1998
1999 /*
2000  * Save a copy of Servinfo4_t structure.
2001  * We might need when there is a failure in getting file handle
2002  * in case of a referral to replace servinfo4 struct and try again.
2003  */
2004 static struct servinfo4 *
2005 copy_svp(servinfo4_t *nsvp)
2006 {
2007         servinfo4_t *svp = NULL;
2008         struct knetconfig *sknconf, *tknconf;
2009         struct netbuf *saddr, *taddr;
2010
2011         svp = kmem_zalloc(sizeof (*svp), KM_SLEEP);
2012         nfs_rw_init(&svp->sv_lock, NULL, RW_DEFAULT, NULL);
2013         svp->sv_flags = nsvp->sv_flags;
2014         svp->sv_fsid = nsvp->sv_fsid;
2015         svp->sv_hostnamelen = nsvp->sv_hostnamelen;
2016         svp->sv_pathlen = nsvp->sv_pathlen;
2017         svp->sv_supp_attrs = nsvp->sv_supp_attrs;
2018
2019         svp->sv_path = kmem_alloc(svp->sv_pathlen, KM_SLEEP);
2020         svp->sv_hostname = kmem_alloc(svp->sv_hostnamelen, KM_SLEEP);
2021         bcopy(nsvp->sv_hostname, svp->sv_hostname, svp->sv_hostnamelen);
2022         bcopy(nsvp->sv_path, svp->sv_path, svp->sv_pathlen);
2023
2024         saddr = &nsvp->sv_addr;
2025         taddr = &svp->sv_addr;
2026         taddr->maxlen = saddr->maxlen;
2027         taddr->len = saddr->len;
2028         if (saddr->len > 0) {
2029                 taddr->buf = kmem_zalloc(saddr->maxlen, KM_SLEEP);
2030                 bcopy(saddr->buf, taddr->buf, saddr->len);
2031         }
2032
2033         svp->sv_knconf = kmem_zalloc(sizeof (struct knetconfig), KM_SLEEP);
2034         sknconf = nsvp->sv_knconf;
2035         tknconf = svp->sv_knconf;
2036         tknconf->knc_semantics = sknconf->knc_semantics;
2037         tknconf->knc_rdev = sknconf->knc_rdev;
2038         if (sknconf->knc_proto != NULL) {
2039                 tknconf->knc_proto = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
2040                 bcopy(sknconf->knc_proto, (char *)tknconf->knc_proto,
2041                     KNC_STRSIZE);
2042         }
2043         if (sknconf->knc_protofmly != NULL) {
2044                 tknconf->knc_protofmly = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
2045                 bcopy(sknconf->knc_protofmly, (char *)tknconf->knc_protofmly,
2046                     KNC_STRSIZE);
2047         }
2048
2049         if (nsvp->sv_origknconf != NULL) {
2050                 svp->sv_origknconf = kmem_zalloc(sizeof (struct knetconfig),
2051                     KM_SLEEP);
2052                 sknconf = nsvp->sv_origknconf;
2053                 tknconf = svp->sv_origknconf;
2054                 tknconf->knc_semantics = sknconf->knc_semantics;
2055                 tknconf->knc_rdev = sknconf->knc_rdev;
2056                 if (sknconf->knc_proto != NULL) {
2057                         tknconf->knc_proto = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
2058                         bcopy(sknconf->knc_proto, (char *)tknconf->knc_proto,
2059                             KNC_STRSIZE);
2060                 }
2061                 if (sknconf->knc_protofmly != NULL) {
2062                         tknconf->knc_protofmly = kmem_zalloc(KNC_STRSIZE,
2063                             KM_SLEEP);
2064                         bcopy(sknconf->knc_protofmly,
2065                             (char *)tknconf->knc_protofmly, KNC_STRSIZE);
2066                 }
2067         }
2068
2069         svp->sv_secdata = copy_sec_data(nsvp->sv_secdata);
2070         svp->sv_dhsec = copy_sec_data(svp->sv_dhsec);
2071         /*
2072          * Rest of the security information is not copied as they are built
2073          * with the information available from secdata and dhsec.
2074          */
2075         svp->sv_next = NULL;
2076
2077         return (svp);
2078 }
2079
2080 servinfo4_t *
2081 restore_svp(mntinfo4_t *mi, servinfo4_t *svp, servinfo4_t *origsvp)
2082 {
2083         servinfo4_t *srvnext, *tmpsrv;
2084
2085         if (strcmp(svp->sv_hostname, origsvp->sv_hostname) != 0) {
2086                 /*
2087                  * Since the hostname changed, we must be dealing
2088                  * with a referral, and the lookup failed.  We will
2089                  * restore the whole servinfo4_t to what it was before.
2090                  */
2091                 srvnext = svp->sv_next;
2092                 svp->sv_next = NULL;
2093                 tmpsrv = copy_svp(origsvp);
2094                 sv4_free(svp);
2095                 svp = tmpsrv;
2096                 svp->sv_next = srvnext;
2097                 mutex_enter(&mi->mi_lock);
2098                 mi->mi_servers = svp;
2099                 mi->mi_curr_serv = svp;
2100                 mutex_exit(&mi->mi_lock);
2101
2102         } else if (origsvp->sv_pathlen != svp->sv_pathlen) {
2103
2104                 /*
2105                  * For symlink case: restore original path because
2106                  * it might have contained symlinks that were
2107                  * expanded by nfsgetfh_otw before the failure occurred.
2108                  */
2109                 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2110                 kmem_free(svp->sv_path, svp->sv_pathlen);
2111                 svp->sv_path =
2112                     kmem_alloc(origsvp->sv_pathlen, KM_SLEEP);
2113                 svp->sv_pathlen = origsvp->sv_pathlen;
2114                 bcopy(origsvp->sv_path, svp->sv_path,
2115                     origsvp->sv_pathlen);
2116                 nfs_rw_exit(&svp->sv_lock);
2117         }
2118         return (svp);
2119 }
2120
2121 static ushort_t nfs4_max_threads = 8;   /* max number of active async threads */
2122 uint_t nfs4_bsize = 32 * 1024;  /* client `block' size */
2123 static uint_t nfs4_async_clusters = 1;  /* # of reqs from each async queue */
2124 static uint_t nfs4_cots_timeo = NFS_COTS_TIMEO;
2125
2126 /*
2127  * Remap the root filehandle for the given filesystem.
2128  *
2129  * results returned via the nfs4_error_t parameter.
2130  */
2131 void
2132 nfs4_remap_root(mntinfo4_t *mi, nfs4_error_t *ep, int flags)
2133 {
2134         struct servinfo4 *svp, *origsvp;
2135         vtype_t vtype;
2136         nfs_fh4 rootfh;
2137         int getfh_flags;
2138         int num_retry;
2139
2140         mutex_enter(&mi->mi_lock);
2141
2142 remap_retry:
2143         svp = mi->mi_curr_serv;
2144         getfh_flags =
2145             (flags & NFS4_REMAP_NEEDSOP) ? NFS4_GETFH_NEEDSOP : 0;
2146         getfh_flags |=
2147             (mi->mi_flags & MI4_PUBLIC) ? NFS4_GETFH_PUBLIC : 0;
2148         mutex_exit(&mi->mi_lock);
2149
2150         /*
2151          * Just in case server path being mounted contains
2152          * symlinks and fails w/STALE, save the initial sv_path
2153          * so we can redrive the initial mount compound with the
2154          * initial sv_path -- not a symlink-expanded version.
2155          *
2156          * This could only happen if a symlink was expanded
2157          * and the expanded mount compound failed stale.  Because
2158          * it could be the case that the symlink was removed at
2159          * the server (and replaced with another symlink/dir,
2160          * we need to use the initial sv_path when attempting
2161          * to re-lookup everything and recover.
2162          */
2163         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2164         origsvp = copy_svp(svp);
2165         nfs_rw_exit(&svp->sv_lock);
2166
2167         num_retry = nfs4_max_mount_retry;
2168
2169         do {
2170                 /*
2171                  * Get the root fh from the server.  Retry nfs4_max_mount_retry
2172                  * (2) times if it fails with STALE since the recovery
2173                  * infrastructure doesn't do STALE recovery for components
2174                  * of the server path to the object being mounted.
2175                  */
2176                 nfs4getfh_otw(mi, svp, &vtype, getfh_flags, CRED(), ep);
2177
2178                 if (ep->error == 0 && ep->stat == NFS4_OK)
2179                         break;
2180
2181                 /*
2182                  * For some reason, the mount compound failed.  Before
2183                  * retrying, we need to restore original conditions.
2184                  */
2185                 svp = restore_svp(mi, svp, origsvp);
2186
2187         } while (num_retry-- > 0);
2188
2189         sv4_free(origsvp);
2190
2191         if (ep->error != 0 || ep->stat != 0) {
2192                 return;
2193         }
2194
2195         if (vtype != VNON && vtype != mi->mi_type) {
2196                 /* shouldn't happen */
2197                 zcmn_err(mi->mi_zone->zone_id, CE_WARN,
2198                     "nfs4_remap_root: server root vnode type (%d) doesn't "
2199                     "match mount info (%d)", vtype, mi->mi_type);
2200         }
2201
2202         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2203         rootfh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
2204         rootfh.nfs_fh4_len = svp->sv_fhandle.fh_len;
2205         nfs_rw_exit(&svp->sv_lock);
2206         sfh4_update(mi->mi_rootfh, &rootfh);
2207
2208         /*
2209          * It's possible that recovery took place on the filesystem
2210          * and the server has been updated between the time we did
2211          * the nfs4getfh_otw and now. Re-drive the otw operation
2212          * to make sure we have a good fh.
2213          */
2214         mutex_enter(&mi->mi_lock);
2215         if (mi->mi_curr_serv != svp)
2216                 goto remap_retry;
2217
2218         mutex_exit(&mi->mi_lock);
2219 }
2220
2221 static int
2222 nfs4rootvp(vnode_t **rtvpp, vfs_t *vfsp, struct servinfo4 *svp_head,
2223     int flags, cred_t *cr, zone_t *zone)
2224 {
2225         vnode_t *rtvp = NULL;
2226         mntinfo4_t *mi;
2227         dev_t nfs_dev;
2228         int error = 0;
2229         rnode4_t *rp;
2230         int i, len;
2231         struct vattr va;
2232         vtype_t vtype = VNON;
2233         vtype_t tmp_vtype = VNON;
2234         struct servinfo4 *firstsvp = NULL, *svp = svp_head;
2235         nfs4_oo_hash_bucket_t *bucketp;
2236         nfs_fh4 fh;
2237         char *droptext = "";
2238         struct nfs_stats *nfsstatsp;
2239         nfs4_fname_t *mfname;
2240         nfs4_error_t e;
2241         int num_retry, removed;
2242         cred_t *lcr = NULL, *tcr = cr;
2243         struct servinfo4 *origsvp;
2244         char *resource;
2245
2246         nfsstatsp = zone_getspecific(nfsstat_zone_key, nfs_zone());
2247         ASSERT(nfsstatsp != NULL);
2248
2249         ASSERT(nfs_zone() == zone);
2250         ASSERT(crgetref(cr));
2251
2252         /*
2253          * Create a mount record and link it to the vfs struct.
2254          */
2255         mi = kmem_zalloc(sizeof (*mi), KM_SLEEP);
2256         mutex_init(&mi->mi_lock, NULL, MUTEX_DEFAULT, NULL);
2257         nfs_rw_init(&mi->mi_recovlock, NULL, RW_DEFAULT, NULL);
2258         nfs_rw_init(&mi->mi_rename_lock, NULL, RW_DEFAULT, NULL);
2259         nfs_rw_init(&mi->mi_fh_lock, NULL, RW_DEFAULT, NULL);
2260
2261         if (!(flags & NFSMNT_SOFT))
2262                 mi->mi_flags |= MI4_HARD;
2263         if ((flags & NFSMNT_NOPRINT))
2264                 mi->mi_flags |= MI4_NOPRINT;
2265         if (flags & NFSMNT_INT)
2266                 mi->mi_flags |= MI4_INT;
2267         if (flags & NFSMNT_PUBLIC)
2268                 mi->mi_flags |= MI4_PUBLIC;
2269         if (flags & NFSMNT_MIRRORMOUNT)
2270                 mi->mi_flags |= MI4_MIRRORMOUNT;
2271         if (flags & NFSMNT_REFERRAL)
2272                 mi->mi_flags |= MI4_REFERRAL;
2273         mi->mi_retrans = NFS_RETRIES;
2274         if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
2275             svp->sv_knconf->knc_semantics == NC_TPI_COTS)
2276                 mi->mi_timeo = nfs4_cots_timeo;
2277         else
2278                 mi->mi_timeo = NFS_TIMEO;
2279         mi->mi_prog = NFS_PROGRAM;
2280         mi->mi_vers = NFS_V4;
2281         mi->mi_rfsnames = rfsnames_v4;
2282         mi->mi_reqs = nfsstatsp->nfs_stats_v4.rfsreqcnt_ptr;
2283         cv_init(&mi->mi_failover_cv, NULL, CV_DEFAULT, NULL);
2284         mi->mi_servers = svp;
2285         mi->mi_curr_serv = svp;
2286         mi->mi_acregmin = SEC2HR(ACREGMIN);
2287         mi->mi_acregmax = SEC2HR(ACREGMAX);
2288         mi->mi_acdirmin = SEC2HR(ACDIRMIN);
2289         mi->mi_acdirmax = SEC2HR(ACDIRMAX);
2290         mi->mi_fh_expire_type = FH4_PERSISTENT;
2291         mi->mi_clientid_next = NULL;
2292         mi->mi_clientid_prev = NULL;
2293         mi->mi_srv = NULL;
2294         mi->mi_grace_wait = 0;
2295         mi->mi_error = 0;
2296         mi->mi_srvsettime = 0;
2297         mi->mi_srvset_cnt = 0;
2298
2299         mi->mi_count = 1;
2300
2301         mi->mi_tsize = nfs4_tsize(svp->sv_knconf);
2302         mi->mi_stsize = mi->mi_tsize;
2303
2304         if (flags & NFSMNT_DIRECTIO)
2305                 mi->mi_flags |= MI4_DIRECTIO;
2306
2307         mi->mi_flags |= MI4_MOUNTING;
2308
2309         /*
2310          * Make a vfs struct for nfs.  We do this here instead of below
2311          * because rtvp needs a vfs before we can do a getattr on it.
2312          *
2313          * Assign a unique device id to the mount
2314          */
2315         mutex_enter(&nfs_minor_lock);
2316         do {
2317                 nfs_minor = (nfs_minor + 1) & MAXMIN32;
2318                 nfs_dev = makedevice(nfs_major, nfs_minor);
2319         } while (vfs_devismounted(nfs_dev));
2320         mutex_exit(&nfs_minor_lock);
2321
2322         vfsp->vfs_dev = nfs_dev;
2323         vfs_make_fsid(&vfsp->vfs_fsid, nfs_dev, nfs4fstyp);
2324         vfsp->vfs_data = (caddr_t)mi;
2325         vfsp->vfs_fstype = nfsfstyp;
2326         vfsp->vfs_bsize = nfs4_bsize;
2327
2328         /*
2329          * Initialize fields used to support async putpage operations.
2330          */
2331         for (i = 0; i < NFS4_ASYNC_TYPES; i++)
2332                 mi->mi_async_clusters[i] = nfs4_async_clusters;
2333         mi->mi_async_init_clusters = nfs4_async_clusters;
2334         mi->mi_async_curr[NFS4_ASYNC_QUEUE] =
2335             mi->mi_async_curr[NFS4_ASYNC_PGOPS_QUEUE] = &mi->mi_async_reqs[0];
2336         mi->mi_max_threads = nfs4_max_threads;
2337         mutex_init(&mi->mi_async_lock, NULL, MUTEX_DEFAULT, NULL);
2338         cv_init(&mi->mi_async_reqs_cv, NULL, CV_DEFAULT, NULL);
2339         cv_init(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE], NULL, CV_DEFAULT,
2340             NULL);
2341         cv_init(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE], NULL,
2342             CV_DEFAULT, NULL);
2343         cv_init(&mi->mi_async_cv, NULL, CV_DEFAULT, NULL);
2344         cv_init(&mi->mi_inact_req_cv, NULL, CV_DEFAULT, NULL);
2345
2346         mi->mi_vfsp = vfsp;
2347         mi->mi_zone = zone;
2348         zone_init_ref(&mi->mi_zone_ref);
2349         zone_hold_ref(zone, &mi->mi_zone_ref, ZONE_REF_NFSV4);
2350         nfs4_mi_zonelist_add(mi);
2351
2352         /*
2353          * Initialize the <open owner/cred> hash table.
2354          */
2355         for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
2356                 bucketp = &(mi->mi_oo_list[i]);
2357                 mutex_init(&bucketp->b_lock, NULL, MUTEX_DEFAULT, NULL);
2358                 list_create(&bucketp->b_oo_hash_list,
2359                     sizeof (nfs4_open_owner_t),
2360                     offsetof(nfs4_open_owner_t, oo_hash_node));
2361         }
2362
2363         /*
2364          * Initialize the freed open owner list.
2365          */
2366         mi->mi_foo_num = 0;
2367         mi->mi_foo_max = NFS4_NUM_FREED_OPEN_OWNERS;
2368         list_create(&mi->mi_foo_list, sizeof (nfs4_open_owner_t),
2369             offsetof(nfs4_open_owner_t, oo_foo_node));
2370
2371         list_create(&mi->mi_lost_state, sizeof (nfs4_lost_rqst_t),
2372             offsetof(nfs4_lost_rqst_t, lr_node));
2373
2374         list_create(&mi->mi_bseqid_list, sizeof (nfs4_bseqid_entry_t),
2375             offsetof(nfs4_bseqid_entry_t, bs_node));
2376
2377         /*
2378          * Initialize the msg buffer.
2379          */
2380         list_create(&mi->mi_msg_list, sizeof (nfs4_debug_msg_t),
2381             offsetof(nfs4_debug_msg_t, msg_node));
2382         mi->mi_msg_count = 0;
2383         mutex_init(&mi->mi_msg_list_lock, NULL, MUTEX_DEFAULT, NULL);
2384
2385         /*
2386          * Initialize kstats
2387          */
2388         nfs4_mnt_kstat_init(vfsp);
2389
2390         /*
2391          * Initialize the shared filehandle pool.
2392          */
2393         sfh4_createtab(&mi->mi_filehandles);
2394
2395         /*
2396          * Save server path we're attempting to mount.
2397          */
2398         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2399         origsvp = copy_svp(svp);
2400         nfs_rw_exit(&svp->sv_lock);
2401
2402         /*
2403          * Make the GETFH call to get root fh for each replica.
2404          */
2405         if (svp_head->sv_next)
2406                 droptext = ", dropping replica";
2407
2408         /*
2409          * If the uid is set then set the creds for secure mounts
2410          * by proxy processes such as automountd.
2411          */
2412         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2413         if (svp->sv_secdata->uid != 0 &&
2414             svp->sv_secdata->rpcflavor == RPCSEC_GSS) {
2415                 lcr = crdup(cr);
2416                 (void) crsetugid(lcr, svp->sv_secdata->uid, crgetgid(cr));
2417                 tcr = lcr;
2418         }
2419         nfs_rw_exit(&svp->sv_lock);
2420         for (svp = svp_head; svp; svp = svp->sv_next) {
2421                 if (nfs4_chkdup_servinfo4(svp_head, svp)) {
2422                         nfs_cmn_err(error, CE_WARN,
2423                             VERS_MSG "Host %s is a duplicate%s",
2424                             svp->sv_hostname, droptext);
2425                         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2426                         svp->sv_flags |= SV4_NOTINUSE;
2427                         nfs_rw_exit(&svp->sv_lock);
2428                         continue;
2429                 }
2430                 mi->mi_curr_serv = svp;
2431
2432                 /*
2433                  * Just in case server path being mounted contains
2434                  * symlinks and fails w/STALE, save the initial sv_path
2435                  * so we can redrive the initial mount compound with the
2436                  * initial sv_path -- not a symlink-expanded version.
2437                  *
2438                  * This could only happen if a symlink was expanded
2439                  * and the expanded mount compound failed stale.  Because
2440                  * it could be the case that the symlink was removed at
2441                  * the server (and replaced with another symlink/dir,
2442                  * we need to use the initial sv_path when attempting
2443                  * to re-lookup everything and recover.
2444                  *
2445                  * Other mount errors should evenutally be handled here also
2446                  * (NFS4ERR_DELAY, NFS4ERR_RESOURCE).  For now, all mount
2447                  * failures will result in mount being redriven a few times.
2448                  */
2449                 num_retry = nfs4_max_mount_retry;
2450                 do {
2451                         nfs4getfh_otw(mi, svp, &tmp_vtype,
2452                             ((flags & NFSMNT_PUBLIC) ? NFS4_GETFH_PUBLIC : 0) |
2453                             NFS4_GETFH_NEEDSOP, tcr, &e);
2454
2455                         if (e.error == 0 && e.stat == NFS4_OK)
2456                                 break;
2457
2458                         /*
2459                          * For some reason, the mount compound failed.  Before
2460                          * retrying, we need to restore original conditions.
2461                          */
2462                         svp = restore_svp(mi, svp, origsvp);
2463                         svp_head = svp;
2464
2465                 } while (num_retry-- > 0);
2466                 error = e.error ? e.error : geterrno4(e.stat);
2467                 if (error) {
2468                         nfs_cmn_err(error, CE_WARN,
2469                             VERS_MSG "initial call to %s failed%s: %m",
2470                             svp->sv_hostname, droptext);
2471                         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2472                         svp->sv_flags |= SV4_NOTINUSE;
2473                         nfs_rw_exit(&svp->sv_lock);
2474                         mi->mi_flags &= ~MI4_RECOV_FAIL;
2475                         mi->mi_error = 0;
2476                         continue;
2477                 }
2478
2479                 if (tmp_vtype == VBAD) {
2480                         zcmn_err(mi->mi_zone->zone_id, CE_WARN,
2481                             VERS_MSG "%s returned a bad file type for "
2482                             "root%s", svp->sv_hostname, droptext);
2483                         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2484                         svp->sv_flags |= SV4_NOTINUSE;
2485                         nfs_rw_exit(&svp->sv_lock);
2486                         continue;
2487                 }
2488
2489                 if (vtype == VNON) {
2490                         vtype = tmp_vtype;
2491                 } else if (vtype != tmp_vtype) {
2492                         zcmn_err(mi->mi_zone->zone_id, CE_WARN,
2493                             VERS_MSG "%s returned a different file type "
2494                             "for root%s", svp->sv_hostname, droptext);
2495                         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2496                         svp->sv_flags |= SV4_NOTINUSE;
2497                         nfs_rw_exit(&svp->sv_lock);
2498                         continue;
2499                 }
2500                 if (firstsvp == NULL)
2501                         firstsvp = svp;
2502         }
2503
2504         if (firstsvp == NULL) {
2505                 if (error == 0)
2506                         error = ENOENT;
2507                 goto bad;
2508         }
2509
2510         mi->mi_curr_serv = svp = firstsvp;
2511         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2512         ASSERT((mi->mi_curr_serv->sv_flags & SV4_NOTINUSE) == 0);
2513         fh.nfs_fh4_len = svp->sv_fhandle.fh_len;
2514         fh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
2515         mi->mi_rootfh = sfh4_get(&fh, mi);
2516         fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
2517         fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
2518         mi->mi_srvparentfh = sfh4_get(&fh, mi);
2519         nfs_rw_exit(&svp->sv_lock);
2520
2521         /*
2522          * Get the fname for filesystem root.
2523          */
2524         mi->mi_fname = fn_get(NULL, ".", mi->mi_rootfh);
2525         mfname = mi->mi_fname;
2526         fn_hold(mfname);
2527
2528         /*
2529          * Make the root vnode without attributes.
2530          */
2531         rtvp = makenfs4node_by_fh(mi->mi_rootfh, NULL,
2532             &mfname, NULL, mi, cr, gethrtime());
2533         rtvp->v_type = vtype;
2534
2535         mi->mi_curread = mi->mi_tsize;
2536         mi->mi_curwrite = mi->mi_stsize;
2537
2538         /*
2539          * Start the manager thread responsible for handling async worker
2540          * threads.
2541          */
2542         MI4_HOLD(mi);
2543         VFS_HOLD(vfsp); /* add reference for thread */
2544         mi->mi_manager_thread = zthread_create(NULL, 0, nfs4_async_manager,
2545             vfsp, 0, minclsyspri);
2546         ASSERT(mi->mi_manager_thread != NULL);
2547
2548         /*
2549          * Create the thread that handles over-the-wire calls for
2550          * fop_inactive.
2551          * This needs to happen after the manager thread is created.
2552          */
2553         MI4_HOLD(mi);
2554         mi->mi_inactive_thread = zthread_create(NULL, 0, nfs4_inactive_thread,
2555             mi, 0, minclsyspri);
2556         ASSERT(mi->mi_inactive_thread != NULL);
2557
2558         /* If we didn't get a type, get one now */
2559         if (rtvp->v_type == VNON) {
2560                 va.va_mask = AT_TYPE;
2561                 error = nfs4getattr(rtvp, &va, tcr);
2562                 if (error)
2563                         goto bad;
2564                 rtvp->v_type = va.va_type;
2565         }
2566
2567         mi->mi_type = rtvp->v_type;
2568
2569         mutex_enter(&mi->mi_lock);
2570         mi->mi_flags &= ~MI4_MOUNTING;
2571         mutex_exit(&mi->mi_lock);
2572
2573         /* Update VFS with new server and path info */
2574         if ((strcmp(svp->sv_hostname, origsvp->sv_hostname) != 0) ||
2575             (strcmp(svp->sv_path, origsvp->sv_path) != 0)) {
2576                 len = svp->sv_hostnamelen + svp->sv_pathlen;
2577                 resource = kmem_zalloc(len, KM_SLEEP);
2578                 (void) strcat(resource, svp->sv_hostname);
2579                 (void) strcat(resource, ":");
2580                 (void) strcat(resource, svp->sv_path);
2581                 vfs_setresource(vfsp, resource, 0);
2582                 kmem_free(resource, len);
2583         }
2584
2585         sv4_free(origsvp);
2586         *rtvpp = rtvp;
2587         if (lcr != NULL)
2588                 crfree(lcr);
2589
2590         return (0);
2591 bad:
2592         /*
2593          * An error occurred somewhere, need to clean up...
2594          */
2595         if (lcr != NULL)
2596                 crfree(lcr);
2597
2598         if (rtvp != NULL) {
2599                 /*
2600                  * We need to release our reference to the root vnode and
2601                  * destroy the mntinfo4 struct that we just created.
2602                  */
2603                 rp = VTOR4(rtvp);
2604                 if (rp->r_flags & R4HASHED)
2605                         rp4_rmhash(rp);
2606                 VN_RELE(rtvp);
2607         }
2608         nfs4_async_stop(vfsp);
2609         nfs4_async_manager_stop(vfsp);
2610         removed = nfs4_mi_zonelist_remove(mi);
2611         if (removed)
2612                 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
2613
2614         /*
2615          * This releases the initial "hold" of the mi since it will never
2616          * be referenced by the vfsp.  Also, when mount returns to vfs.c
2617          * with an error, the vfsp will be destroyed, not rele'd.
2618          */
2619         MI4_RELE(mi);
2620
2621         if (origsvp != NULL)
2622                 sv4_free(origsvp);
2623
2624         *rtvpp = NULL;
2625         return (error);
2626 }
2627
2628 /*
2629  * vfs operations
2630  */
2631 static int
2632 nfs4_unmount(vfs_t *vfsp, int flag, cred_t *cr)
2633 {
2634         mntinfo4_t              *mi;
2635         ushort_t                omax;
2636         int                     removed;
2637
2638         bool_t                  must_unlock;
2639
2640         nfs4_ephemeral_tree_t   *eph_tree;
2641
2642         if (secpolicy_fs_unmount(cr, vfsp) != 0)
2643                 return (EPERM);
2644
2645         mi = VFTOMI4(vfsp);
2646
2647         if (flag & MS_FORCE) {
2648                 vfsp->vfs_flag |= VFS_UNMOUNTED;
2649                 if (nfs_zone() != mi->mi_zone) {
2650                         /*
2651                          * If the request is coming from the wrong zone,
2652                          * we don't want to create any new threads, and
2653                          * performance is not a concern.  Do everything
2654                          * inline.
2655                          */
2656                         NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2657                             "nfs4_unmount x-zone forced unmount of vfs %p\n",
2658                             (void *)vfsp));
2659                         nfs4_free_mount(vfsp, flag, cr);
2660                 } else {
2661                         /*
2662                          * Free data structures asynchronously, to avoid
2663                          * blocking the current thread (for performance
2664                          * reasons only).
2665                          */
2666                         async_free_mount(vfsp, flag, cr);
2667                 }
2668
2669                 return (0);
2670         }
2671
2672         /*
2673          * Wait until all asynchronous putpage operations on
2674          * this file system are complete before flushing rnodes
2675          * from the cache.
2676          */
2677         omax = mi->mi_max_threads;
2678         if (nfs4_async_stop_sig(vfsp))
2679                 return (EINTR);
2680
2681         r4flush(vfsp, cr);
2682
2683         /*
2684          * About the only reason that this would fail would be
2685          * that the harvester is already busy tearing down this
2686          * node. So we fail back to the caller and let them try
2687          * again when needed.
2688          */
2689         if (nfs4_ephemeral_umount(mi, flag, cr,
2690             &must_unlock, &eph_tree)) {
2691                 ASSERT(must_unlock == FALSE);
2692                 mutex_enter(&mi->mi_async_lock);
2693                 mi->mi_max_threads = omax;
2694                 mutex_exit(&mi->mi_async_lock);
2695
2696                 return (EBUSY);
2697         }
2698
2699         /*
2700          * If there are any active vnodes on this file system,
2701          * then the file system is busy and can't be unmounted.
2702          */
2703         if (check_rtable4(vfsp)) {
2704                 nfs4_ephemeral_umount_unlock(&must_unlock, &eph_tree);
2705
2706                 mutex_enter(&mi->mi_async_lock);
2707                 mi->mi_max_threads = omax;
2708                 mutex_exit(&mi->mi_async_lock);
2709
2710                 return (EBUSY);
2711         }
2712
2713         /*
2714          * The unmount can't fail from now on, so record any
2715          * ephemeral changes.
2716          */
2717         nfs4_ephemeral_umount_activate(mi, &must_unlock, &eph_tree);
2718
2719         /*
2720          * There are no active files that could require over-the-wire
2721          * calls to the server, so stop the async manager and the
2722          * inactive thread.
2723          */
2724         nfs4_async_manager_stop(vfsp);
2725
2726         /*
2727          * Destroy all rnodes belonging to this file system from the
2728          * rnode hash queues and purge any resources allocated to
2729          * them.
2730          */
2731         destroy_rtable4(vfsp, cr);
2732         vfsp->vfs_flag |= VFS_UNMOUNTED;
2733
2734         nfs4_remove_mi_from_server(mi, NULL);
2735         removed = nfs4_mi_zonelist_remove(mi);
2736         if (removed)
2737                 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
2738
2739         return (0);
2740 }
2741
2742 /*
2743  * find root of nfs
2744  */
2745 static int
2746 nfs4_root(vfs_t *vfsp, vnode_t **vpp)
2747 {
2748         mntinfo4_t *mi;
2749         vnode_t *vp;
2750         nfs4_fname_t *mfname;
2751         servinfo4_t *svp;
2752
2753         mi = VFTOMI4(vfsp);
2754
2755         if (nfs_zone() != mi->mi_zone)
2756                 return (EPERM);
2757
2758         svp = mi->mi_curr_serv;
2759         if (svp) {
2760                 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2761                 if (svp->sv_flags & SV4_ROOT_STALE) {
2762                         nfs_rw_exit(&svp->sv_lock);
2763
2764                         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2765                         if (svp->sv_flags & SV4_ROOT_STALE) {
2766                                 svp->sv_flags &= ~SV4_ROOT_STALE;
2767                                 nfs_rw_exit(&svp->sv_lock);
2768                                 return (ENOENT);
2769                         }
2770                         nfs_rw_exit(&svp->sv_lock);
2771                 } else
2772                         nfs_rw_exit(&svp->sv_lock);
2773         }
2774
2775         mfname = mi->mi_fname;
2776         fn_hold(mfname);
2777         vp = makenfs4node_by_fh(mi->mi_rootfh, NULL, &mfname, NULL,
2778             VFTOMI4(vfsp), CRED(), gethrtime());
2779
2780         if (VTOR4(vp)->r_flags & R4STALE) {
2781                 VN_RELE(vp);
2782                 return (ENOENT);
2783         }
2784
2785         ASSERT(vp->v_type == VNON || vp->v_type == mi->mi_type);
2786
2787         vp->v_type = mi->mi_type;
2788
2789         *vpp = vp;
2790
2791         return (0);
2792 }
2793
2794 static int
2795 nfs4_statfs_otw(vnode_t *vp, struct statvfs64 *sbp, cred_t *cr)
2796 {
2797         int error;
2798         nfs4_ga_res_t gar;
2799         nfs4_ga_ext_res_t ger;
2800
2801         gar.n4g_ext_res = &ger;
2802
2803         if (error = nfs4_attr_otw(vp, TAG_FSINFO, &gar,
2804             NFS4_STATFS_ATTR_MASK, cr))
2805                 return (error);
2806
2807         *sbp = gar.n4g_ext_res->n4g_sb;
2808
2809         return (0);
2810 }
2811
2812 /*
2813  * Get file system statistics.
2814  */
2815 static int
2816 nfs4_statvfs(vfs_t *vfsp, struct statvfs64 *sbp)
2817 {
2818         int error;
2819         vnode_t *vp;
2820         cred_t *cr;
2821
2822         error = nfs4_root(vfsp, &vp);
2823         if (error)
2824                 return (error);
2825
2826         cr = CRED();
2827
2828         error = nfs4_statfs_otw(vp, sbp, cr);
2829         if (!error) {
2830                 (void) strncpy(sbp->f_basetype,
2831                     vfssw[vfsp->vfs_fstype].vsw_name, FSTYPSZ);
2832                 sbp->f_flag = vf_to_stf(vfsp->vfs_flag);
2833         } else {
2834                 nfs4_purge_stale_fh(error, vp, cr);
2835         }
2836
2837         VN_RELE(vp);
2838
2839         return (error);
2840 }
2841
2842 static kmutex_t nfs4_syncbusy;
2843
2844 /*
2845  * Flush dirty nfs files for file system vfsp.
2846  * If vfsp == NULL, all nfs files are flushed.
2847  *
2848  * SYNC_CLOSE in flag is passed to us to
2849  * indicate that we are shutting down and or
2850  * rebooting.
2851  */
2852 static int
2853 nfs4_sync(vfs_t *vfsp, short flag, cred_t *cr)
2854 {
2855         /*
2856          * Cross-zone calls are OK here, since this translates to a
2857          * fop_putpage(B_ASYNC), which gets picked up by the right zone.
2858          */
2859         if (!(flag & SYNC_ATTR) && mutex_tryenter(&nfs4_syncbusy) != 0) {
2860                 r4flush(vfsp, cr);
2861                 mutex_exit(&nfs4_syncbusy);
2862         }
2863
2864         /*
2865          * if SYNC_CLOSE is set then we know that
2866          * the system is rebooting, mark the mntinfo
2867          * for later examination.
2868          */
2869         if (vfsp && (flag & SYNC_CLOSE)) {
2870                 mntinfo4_t *mi;
2871
2872                 mi = VFTOMI4(vfsp);
2873                 if (!(mi->mi_flags & MI4_SHUTDOWN)) {
2874                         mutex_enter(&mi->mi_lock);
2875                         mi->mi_flags |= MI4_SHUTDOWN;
2876                         mutex_exit(&mi->mi_lock);
2877                 }
2878         }
2879         return (0);
2880 }
2881
2882 /*
2883  * vget is difficult, if not impossible, to support in v4 because we don't
2884  * know the parent directory or name, which makes it impossible to create a
2885  * useful shadow vnode.  And we need the shadow vnode for things like
2886  * OPEN.
2887  */
2888
2889 /* ARGSUSED */
2890 /*
2891  * XXX Check nfs4_vget_pseudo() for dependency.
2892  */
2893 static int
2894 nfs4_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
2895 {
2896         return (EREMOTE);
2897 }
2898
2899 /*
2900  * nfs4_mountroot get called in the case where we are diskless booting.  All
2901  * we need from here is the ability to get the server info and from there we
2902  * can simply call nfs4_rootvp.
2903  */
2904 /* ARGSUSED */
2905 static int
2906 nfs4_mountroot(vfs_t *vfsp, whymountroot_t why)
2907 {
2908         vnode_t *rtvp;
2909         char root_hostname[SYS_NMLN+1];
2910         struct servinfo4 *svp;
2911         int error;
2912         int vfsflags;
2913         size_t size;
2914         char *root_path;
2915         struct pathname pn;
2916         char *name;
2917         cred_t *cr;
2918         mntinfo4_t *mi;
2919         struct nfs_args args;           /* nfs mount arguments */
2920         static char token[10];
2921         nfs4_error_t n4e;
2922
2923         bzero(&args, sizeof (args));
2924
2925         /* do this BEFORE getfile which causes xid stamps to be initialized */
2926         clkset(-1L);            /* hack for now - until we get time svc? */
2927
2928         if (why == ROOT_REMOUNT) {
2929                 /*
2930                  * Shouldn't happen.
2931                  */
2932                 panic("nfs4_mountroot: why == ROOT_REMOUNT");
2933         }
2934
2935         if (why == ROOT_UNMOUNT) {
2936                 /*
2937                  * Nothing to do for NFS.
2938                  */
2939                 return (0);
2940         }
2941
2942         /*
2943          * why == ROOT_INIT
2944          */
2945
2946         name = token;
2947         *name = 0;
2948         (void) getfsname("root", name, sizeof (token));
2949
2950         pn_alloc(&pn);
2951         root_path = pn.pn_path;
2952
2953         svp = kmem_zalloc(sizeof (*svp), KM_SLEEP);
2954         nfs_rw_init(&svp->sv_lock, NULL, RW_DEFAULT, NULL);
2955         svp->sv_knconf = kmem_zalloc(sizeof (*svp->sv_knconf), KM_SLEEP);
2956         svp->sv_knconf->knc_protofmly = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
2957         svp->sv_knconf->knc_proto = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
2958
2959         /*
2960          * Get server address
2961          * Get the root path
2962          * Get server's transport
2963          * Get server's hostname
2964          * Get options
2965          */
2966         args.addr = &svp->sv_addr;
2967         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2968         args.fh = (char *)&svp->sv_fhandle;
2969         args.knconf = svp->sv_knconf;
2970         args.hostname = root_hostname;
2971         vfsflags = 0;
2972         if (error = mount_root(*name ? name : "root", root_path, NFS_V4,
2973             &args, &vfsflags)) {
2974                 if (error == EPROTONOSUPPORT)
2975                         nfs_cmn_err(error, CE_WARN, "nfs4_mountroot: "
2976                             "mount_root failed: server doesn't support NFS V4");
2977                 else
2978                         nfs_cmn_err(error, CE_WARN,
2979                             "nfs4_mountroot: mount_root failed: %m");
2980                 nfs_rw_exit(&svp->sv_lock);
2981                 sv4_free(svp);
2982                 pn_free(&pn);
2983                 return (error);
2984         }
2985         nfs_rw_exit(&svp->sv_lock);
2986         svp->sv_hostnamelen = (int)(strlen(root_hostname) + 1);
2987         svp->sv_hostname = kmem_alloc(svp->sv_hostnamelen, KM_SLEEP);
2988         (void) strcpy(svp->sv_hostname, root_hostname);
2989
2990         svp->sv_pathlen = (int)(strlen(root_path) + 1);
2991         svp->sv_path = kmem_alloc(svp->sv_pathlen, KM_SLEEP);
2992         (void) strcpy(svp->sv_path, root_path);
2993
2994         /*
2995          * Force root partition to always be mounted with AUTH_UNIX for now
2996          */
2997         svp->sv_secdata = kmem_alloc(sizeof (*svp->sv_secdata), KM_SLEEP);
2998         svp->sv_secdata->secmod = AUTH_UNIX;
2999         svp->sv_secdata->rpcflavor = AUTH_UNIX;
3000         svp->sv_secdata->data = NULL;
3001
3002         cr = crgetcred();
3003         rtvp = NULL;
3004
3005         error = nfs4rootvp(&rtvp, vfsp, svp, args.flags, cr, global_zone);
3006
3007         if (error) {
3008                 crfree(cr);
3009                 pn_free(&pn);
3010                 sv4_free(svp);
3011                 return (error);
3012         }
3013
3014         mi = VTOMI4(rtvp);
3015
3016         /*
3017          * Send client id to the server, if necessary
3018          */
3019         nfs4_error_zinit(&n4e);
3020         nfs4setclientid(mi, cr, FALSE, &n4e);
3021         error = n4e.error;
3022
3023         crfree(cr);
3024
3025         if (error) {
3026                 pn_free(&pn);
3027                 goto errout;
3028         }
3029
3030         error = nfs4_setopts(rtvp, DATAMODEL_NATIVE, &args);
3031         if (error) {
3032                 nfs_cmn_err(error, CE_WARN,
3033                     "nfs4_mountroot: invalid root mount options");
3034                 pn_free(&pn);
3035                 goto errout;
3036         }
3037
3038         (void) vfs_lock_wait(vfsp);
3039         vfs_add(NULL, vfsp, vfsflags);
3040         vfs_unlock(vfsp);
3041
3042         size = strlen(svp->sv_hostname);
3043         (void) strcpy(rootfs.bo_name, svp->sv_hostname);
3044         rootfs.bo_name[size] = ':';
3045         (void) strcpy(&rootfs.bo_name[size + 1], root_path);
3046
3047         pn_free(&pn);
3048
3049 errout:
3050         if (error) {
3051                 sv4_free(svp);
3052                 nfs4_async_stop(vfsp);
3053                 nfs4_async_manager_stop(vfsp);
3054         }
3055
3056         if (rtvp != NULL)
3057                 VN_RELE(rtvp);
3058
3059         return (error);
3060 }
3061
3062 /*
3063  * Initialization routine for VFS routines.  Should only be called once
3064  */
3065 int
3066 nfs4_vfsinit(void)
3067 {
3068         mutex_init(&nfs4_syncbusy, NULL, MUTEX_DEFAULT, NULL);
3069         nfs4setclientid_init();
3070         nfs4_ephemeral_init();
3071         return (0);
3072 }
3073
3074 void
3075 nfs4_vfsfini(void)
3076 {
3077         nfs4_ephemeral_fini();
3078         nfs4setclientid_fini();
3079         mutex_destroy(&nfs4_syncbusy);
3080 }
3081
3082 void
3083 nfs4_freevfs(vfs_t *vfsp)
3084 {
3085         mntinfo4_t *mi;
3086
3087         /* need to release the initial hold */
3088         mi = VFTOMI4(vfsp);
3089
3090         /*
3091          * At this point, we can no longer reference the vfs
3092          * and need to inform other holders of the reference
3093          * to the mntinfo4_t.
3094          */
3095         mi->mi_vfsp = NULL;
3096
3097         MI4_RELE(mi);
3098 }
3099
3100 /*
3101  * Client side SETCLIENTID and SETCLIENTID_CONFIRM
3102  */
3103 struct nfs4_server nfs4_server_lst =
3104         { &nfs4_server_lst, &nfs4_server_lst };
3105
3106 kmutex_t nfs4_server_lst_lock;
3107
3108 static void
3109 nfs4setclientid_init(void)
3110 {
3111         mutex_init(&nfs4_server_lst_lock, NULL, MUTEX_DEFAULT, NULL);
3112 }
3113
3114 static void
3115 nfs4setclientid_fini(void)
3116 {
3117         mutex_destroy(&nfs4_server_lst_lock);
3118 }
3119
3120 int nfs4_retry_sclid_delay = NFS4_RETRY_SCLID_DELAY;
3121 int nfs4_num_sclid_retries = NFS4_NUM_SCLID_RETRIES;
3122
3123 /*
3124  * Set the clientid for the server for "mi".  No-op if the clientid is
3125  * already set.
3126  *
3127  * The recovery boolean should be set to TRUE if this function was called
3128  * by the recovery code, and FALSE otherwise.  This is used to determine
3129  * if we need to call nfs4_start/end_op as well as grab the mi_recovlock
3130  * for adding a mntinfo4_t to a nfs4_server_t.
3131  *
3132  * Error is returned via 'n4ep'.  If there was a 'n4ep->stat' error, then
3133  * 'n4ep->error' is set to geterrno4(n4ep->stat).
3134  */
3135 void
3136 nfs4setclientid(mntinfo4_t *mi, cred_t *cr, bool_t recovery, nfs4_error_t *n4ep)
3137 {
3138         struct nfs4_server *np;
3139         struct servinfo4 *svp = mi->mi_curr_serv;
3140         nfs4_recov_state_t recov_state;
3141         int num_retries = 0;
3142         bool_t retry;
3143         cred_t *lcr = NULL;
3144         int retry_inuse = 1; /* only retry once on NFS4ERR_CLID_INUSE */
3145         time_t lease_time = 0;
3146
3147         recov_state.rs_flags = 0;
3148         recov_state.rs_num_retry_despite_err = 0;
3149         ASSERT(n4ep != NULL);
3150
3151 recov_retry:
3152         retry = FALSE;
3153         nfs4_error_zinit(n4ep);
3154         if (!recovery)
3155                 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
3156
3157         mutex_enter(&nfs4_server_lst_lock);
3158         np = servinfo4_to_nfs4_server(svp); /* This locks np if it is found */
3159         mutex_exit(&nfs4_server_lst_lock);
3160         if (!np) {
3161                 struct nfs4_server *tnp;
3162                 np = new_nfs4_server(svp, cr);
3163                 mutex_enter(&np->s_lock);
3164
3165                 mutex_enter(&nfs4_server_lst_lock);
3166                 tnp = servinfo4_to_nfs4_server(svp);
3167                 if (tnp) {
3168                         /*
3169                          * another thread snuck in and put server on list.
3170                          * since we aren't adding it to the nfs4_server_list
3171                          * we need to set the ref count to 0 and destroy it.
3172                          */
3173                         np->s_refcnt = 0;
3174                         destroy_nfs4_server(np);
3175                         np = tnp;
3176                 } else {
3177                         /*
3178                          * do not give list a reference until everything
3179                          * succeeds
3180                          */
3181                         insque(np, &nfs4_server_lst);
3182                 }
3183                 mutex_exit(&nfs4_server_lst_lock);
3184         }
3185         ASSERT(MUTEX_HELD(&np->s_lock));
3186         /*
3187          * If we find the server already has N4S_CLIENTID_SET, then
3188          * just return, we've already done SETCLIENTID to that server
3189          */
3190         if (np->s_flags & N4S_CLIENTID_SET) {
3191                 /* add mi to np's mntinfo4_list */
3192                 nfs4_add_mi_to_server(np, mi);
3193                 if (!recovery)
3194                         nfs_rw_exit(&mi->mi_recovlock);
3195                 mutex_exit(&np->s_lock);
3196                 nfs4_server_rele(np);
3197                 return;
3198         }
3199         mutex_exit(&np->s_lock);
3200
3201
3202         /*
3203          * Drop the mi_recovlock since nfs4_start_op will
3204          * acquire it again for us.
3205          */
3206         if (!recovery) {
3207                 nfs_rw_exit(&mi->mi_recovlock);
3208
3209                 n4ep->error = nfs4_start_op(mi, NULL, NULL, &recov_state);
3210                 if (n4ep->error) {
3211                         nfs4_server_rele(np);
3212                         return;
3213                 }
3214         }
3215
3216         mutex_enter(&np->s_lock);
3217         while (np->s_flags & N4S_CLIENTID_PEND) {
3218                 if (!cv_wait_sig(&np->s_clientid_pend, &np->s_lock)) {
3219                         mutex_exit(&np->s_lock);
3220                         nfs4_server_rele(np);
3221                         if (!recovery)
3222                                 nfs4_end_op(mi, NULL, NULL, &recov_state,
3223                                     recovery);
3224                         n4ep->error = EINTR;
3225                         return;
3226                 }
3227         }
3228
3229         if (np->s_flags & N4S_CLIENTID_SET) {
3230                 /* XXX copied/pasted from above */
3231                 /* add mi to np's mntinfo4_list */
3232                 nfs4_add_mi_to_server(np, mi);
3233                 mutex_exit(&np->s_lock);
3234                 nfs4_server_rele(np);
3235                 if (!recovery)
3236                         nfs4_end_op(mi, NULL, NULL, &recov_state, recovery);
3237                 return;
3238         }
3239
3240         /*
3241          * Reset the N4S_CB_PINGED flag. This is used to
3242          * indicate if we have received a CB_NULL from the
3243          * server. Also we reset the waiter flag.
3244          */
3245         np->s_flags &= ~(N4S_CB_PINGED | N4S_CB_WAITER);
3246         /* any failure must now clear this flag */
3247         np->s_flags |= N4S_CLIENTID_PEND;
3248         mutex_exit(&np->s_lock);
3249         nfs4setclientid_otw(mi, svp, cr, np, n4ep, &retry_inuse);
3250
3251         if (n4ep->error == EACCES) {
3252                 /*
3253                  * If the uid is set then set the creds for secure mounts
3254                  * by proxy processes such as automountd.
3255                  */
3256                 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3257                 if (svp->sv_secdata->uid != 0) {
3258                         lcr = crdup(cr);
3259                         (void) crsetugid(lcr, svp->sv_secdata->uid,
3260                             crgetgid(cr));
3261                 }
3262                 nfs_rw_exit(&svp->sv_lock);
3263
3264                 if (lcr != NULL) {
3265                         mutex_enter(&np->s_lock);
3266                         crfree(np->s_cred);
3267                         np->s_cred = lcr;
3268                         mutex_exit(&np->s_lock);
3269                         nfs4setclientid_otw(mi, svp, lcr, np, n4ep,
3270                             &retry_inuse);
3271                 }
3272         }
3273         mutex_enter(&np->s_lock);
3274         lease_time = np->s_lease_time;
3275         np->s_flags &= ~N4S_CLIENTID_PEND;
3276         mutex_exit(&np->s_lock);
3277
3278         if (n4ep->error != 0 || n4ep->stat != NFS4_OK) {
3279                 /*
3280                  * Start recovery if failover is a possibility.  If
3281                  * invoked by the recovery thread itself, then just
3282                  * return and let it handle the failover first.  NB:
3283                  * recovery is not allowed if the mount is in progress
3284                  * since the infrastructure is not sufficiently setup
3285                  * to allow it.  Just return the error (after suitable
3286                  * retries).
3287                  */
3288                 if (FAILOVER_MOUNT4(mi) && nfs4_try_failover(n4ep)) {
3289                         (void) nfs4_start_recovery(n4ep, mi, NULL,
3290                             NULL, NULL, NULL, OP_SETCLIENTID, NULL, NULL, NULL);
3291                         /*
3292                          * Don't retry here, just return and let
3293                          * recovery take over.
3294                          */
3295                         if (recovery)
3296                                 retry = FALSE;
3297                 } else if (nfs4_rpc_retry_error(n4ep->error) ||
3298                     n4ep->stat == NFS4ERR_RESOURCE ||
3299                     n4ep->stat == NFS4ERR_STALE_CLIENTID) {
3300
3301                         retry = TRUE;
3302                         /*
3303                          * Always retry if in recovery or once had
3304                          * contact with the server (but now it's
3305                          * overloaded).
3306                          */
3307                         if (recovery == TRUE ||
3308                             n4ep->error == ETIMEDOUT ||
3309                             n4ep->error == ECONNRESET)
3310                                 num_retries = 0;
3311                 } else if (retry_inuse && n4ep->error == 0 &&
3312                     n4ep->stat == NFS4ERR_CLID_INUSE) {
3313                         retry = TRUE;
3314                         num_retries = 0;
3315                 }
3316         } else {
3317                 /*
3318                  * Since everything succeeded give the list a reference count if
3319                  * it hasn't been given one by add_new_nfs4_server() or if this
3320                  * is not a recovery situation in which case it is already on
3321                  * the list.
3322                  */
3323                 mutex_enter(&np->s_lock);
3324                 if ((np->s_flags & N4S_INSERTED) == 0) {
3325                         np->s_refcnt++;
3326                         np->s_flags |= N4S_INSERTED;
3327                 }
3328                 mutex_exit(&np->s_lock);
3329         }
3330
3331         if (!recovery)
3332                 nfs4_end_op(mi, NULL, NULL, &recov_state, recovery);
3333
3334
3335         if (retry && num_retries++ < nfs4_num_sclid_retries) {
3336                 if (retry_inuse) {
3337                         ddi_sleep(lease_time + nfs4_retry_sclid_delay);
3338                         retry_inuse = 0;
3339                 } else
3340                         ddi_sleep(nfs4_retry_sclid_delay);
3341
3342                 nfs4_server_rele(np);
3343                 goto recov_retry;
3344         }
3345
3346
3347         if (n4ep->error == 0)
3348                 n4ep->error = geterrno4(n4ep->stat);
3349
3350         /* broadcast before release in case no other threads are waiting */
3351         cv_broadcast(&np->s_clientid_pend);
3352         nfs4_server_rele(np);
3353 }
3354
3355 int nfs4setclientid_otw_debug = 0;
3356
3357 /*
3358  * This function handles the recovery of STALE_CLIENTID for SETCLIENTID_CONFRIM,
3359  * but nothing else; the calling function must be designed to handle those
3360  * other errors.
3361  */
3362 static void
3363 nfs4setclientid_otw(mntinfo4_t *mi, struct servinfo4 *svp,  cred_t *cr,
3364     struct nfs4_server *np, nfs4_error_t *ep, int *retry_inusep)
3365 {
3366         COMPOUND4args_clnt args;
3367         COMPOUND4res_clnt res;
3368         nfs_argop4 argop[3];
3369         SETCLIENTID4args *s_args;
3370         SETCLIENTID4resok *s_resok;
3371         int doqueue = 1;
3372         nfs4_ga_res_t *garp = NULL;
3373         timespec_t prop_time, after_time;
3374         verifier4 verf;
3375         clientid4 tmp_clientid;
3376
3377         ASSERT(!MUTEX_HELD(&np->s_lock));
3378
3379         args.ctag = TAG_SETCLIENTID;
3380
3381         args.array = argop;
3382         args.array_len = 3;
3383
3384         /* PUTROOTFH */
3385         argop[0].argop = OP_PUTROOTFH;
3386
3387         /* GETATTR */
3388         argop[1].argop = OP_GETATTR;
3389         argop[1].nfs_argop4_u.opgetattr.attr_request = FATTR4_LEASE_TIME_MASK;
3390         argop[1].nfs_argop4_u.opgetattr.mi = mi;
3391
3392         /* SETCLIENTID */
3393         argop[2].argop = OP_SETCLIENTID;
3394
3395         s_args = &argop[2].nfs_argop4_u.opsetclientid;
3396
3397         mutex_enter(&np->s_lock);
3398
3399         s_args->client.verifier = np->clidtosend.verifier;
3400         s_args->client.id_len = np->clidtosend.id_len;
3401         ASSERT(s_args->client.id_len <= NFS4_OPAQUE_LIMIT);
3402         s_args->client.id_val = np->clidtosend.id_val;
3403
3404         /*
3405          * Callback needs to happen on non-RDMA transport
3406          * Check if we have saved the original knetconfig
3407          * if so, use that instead.
3408          */
3409         if (svp->sv_origknconf != NULL)
3410                 nfs4_cb_args(np, svp->sv_origknconf, s_args);
3411         else
3412                 nfs4_cb_args(np, svp->sv_knconf, s_args);
3413
3414         mutex_exit(&np->s_lock);
3415
3416         rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);
3417
3418         if (ep->error)
3419                 return;
3420
3421         /* getattr lease_time res */
3422         if ((res.array_len >= 2) &&
3423             (res.array[1].nfs_resop4_u.opgetattr.status == NFS4_OK)) {
3424                 garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
3425
3426 #ifndef _LP64
3427                 /*
3428                  * The 32 bit client cannot handle a lease time greater than
3429                  * (INT32_MAX/1000000).  This is due to the use of the
3430                  * lease_time in calls to drv_usectohz() in
3431                  * nfs4_renew_lease_thread().  The problem is that
3432                  * drv_usectohz() takes a time_t (which is just a long = 4
3433                  * bytes) as its parameter.  The lease_time is multiplied by
3434                  * 1000000 to convert seconds to usecs for the parameter.  If
3435                  * a number bigger than (INT32_MAX/1000000) is used then we
3436                  * overflow on the 32bit client.
3437                  */
3438                 if (garp->n4g_ext_res->n4g_leasetime > (INT32_MAX/1000000)) {
3439                         garp->n4g_ext_res->n4g_leasetime = INT32_MAX/1000000;
3440                 }
3441 #endif
3442
3443                 mutex_enter(&np->s_lock);
3444                 np->s_lease_time = garp->n4g_ext_res->n4g_leasetime;
3445
3446                 /*
3447                  * Keep track of the lease period for the mi's
3448                  * mi_msg_list.  We need an appropiate time
3449                  * bound to associate past facts with a current
3450                  * event.  The lease period is perfect for this.
3451                  */
3452                 mutex_enter(&mi->mi_msg_list_lock);
3453                 mi->mi_lease_period = np->s_lease_time;
3454                 mutex_exit(&mi->mi_msg_list_lock);
3455                 mutex_exit(&np->s_lock);
3456         }
3457
3458
3459         if (res.status == NFS4ERR_CLID_INUSE) {
3460                 clientaddr4 *clid_inuse;
3461
3462                 if (!(*retry_inusep)) {
3463                         clid_inuse = &res.array->nfs_resop4_u.
3464                             opsetclientid.SETCLIENTID4res_u.client_using;
3465
3466                         zcmn_err(mi->mi_zone->zone_id, CE_NOTE,
3467                             "NFS4 mount (SETCLIENTID failed)."
3468                             "  nfs4_client_id.id is in"
3469                             "use already by: r_netid<%s> r_addr<%s>",
3470                             clid_inuse->r_netid, clid_inuse->r_addr);
3471                 }
3472
3473                 /*
3474                  * XXX - The client should be more robust in its
3475                  * handling of clientid in use errors (regen another
3476                  * clientid and try again?)
3477                  */
3478                 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3479                 return;
3480         }
3481
3482         if (res.status) {
3483                 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3484                 return;
3485         }
3486
3487         s_resok = &res.array[2].nfs_resop4_u.
3488             opsetclientid.SETCLIENTID4res_u.resok4;
3489
3490         tmp_clientid = s_resok->clientid;
3491
3492         verf = s_resok->setclientid_confirm;
3493
3494 #ifdef  DEBUG
3495         if (nfs4setclientid_otw_debug) {
3496                 union {
3497                         clientid4       clientid;
3498                         int             foo[2];
3499                 } cid;
3500
3501                 cid.clientid = s_resok->clientid;
3502
3503                 zcmn_err(mi->mi_zone->zone_id, CE_NOTE,
3504                 "nfs4setclientid_otw: OK, clientid = %x,%x, "
3505                 "verifier = %" PRIx64 "\n", cid.foo[0], cid.foo[1], verf);
3506         }
3507 #endif
3508
3509         xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3510
3511         /* Confirm the client id and get the lease_time attribute */
3512
3513         args.ctag = TAG_SETCLIENTID_CF;
3514
3515         args.array = argop;
3516         args.array_len = 1;
3517
3518         argop[0].argop = OP_SETCLIENTID_CONFIRM;
3519
3520         argop[0].nfs_argop4_u.opsetclientid_confirm.clientid = tmp_clientid;
3521         argop[0].nfs_argop4_u.opsetclientid_confirm.setclientid_confirm = verf;
3522
3523         /* used to figure out RTT for np */
3524         gethrestime(&prop_time);
3525
3526         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4setlientid_otw: "
3527             "start time: %ld sec %ld nsec", prop_time.tv_sec,
3528             prop_time.tv_nsec));
3529
3530         rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);
3531
3532         gethrestime(&after_time);
3533         mutex_enter(&np->s_lock);
3534         np->propagation_delay.tv_sec =
3535             MAX(1, after_time.tv_sec - prop_time.tv_sec);
3536         mutex_exit(&np->s_lock);
3537
3538         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4setlcientid_otw: "
3539             "finish time: %ld sec ", after_time.tv_sec));
3540
3541         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4setclientid_otw: "
3542             "propagation delay set to %ld sec",
3543             np->propagation_delay.tv_sec));
3544
3545         if (ep->error)
3546                 return;
3547
3548         if (res.status == NFS4ERR_CLID_INUSE) {
3549                 clientaddr4 *clid_inuse;
3550
3551                 if (!(*retry_inusep)) {
3552                         clid_inuse = &res.array->nfs_resop4_u.
3553                             opsetclientid.SETCLIENTID4res_u.client_using;
3554
3555                         zcmn_err(mi->mi_zone->zone_id, CE_NOTE,
3556                             "SETCLIENTID_CONFIRM failed.  "
3557                             "nfs4_client_id.id is in use already by: "
3558                             "r_netid<%s> r_addr<%s>",
3559                             clid_inuse->r_netid, clid_inuse->r_addr);
3560                 }
3561
3562                 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3563                 return;
3564         }
3565
3566         if (res.status) {
3567                 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3568                 return;
3569         }
3570
3571         mutex_enter(&np->s_lock);
3572         np->clientid = tmp_clientid;
3573         np->s_flags |= N4S_CLIENTID_SET;
3574
3575         /* Add mi to np's mntinfo4 list */
3576         nfs4_add_mi_to_server(np, mi);
3577
3578         if (np->lease_valid == NFS4_LEASE_NOT_STARTED) {
3579                 /*
3580                  * Start lease management thread.
3581                  * Keep trying until we succeed.
3582                  */
3583
3584                 np->s_refcnt++;         /* pass reference to thread */
3585                 (void) zthread_create(NULL, 0, nfs4_renew_lease_thread, np, 0,
3586                     minclsyspri);
3587         }
3588         mutex_exit(&np->s_lock);
3589
3590         xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3591 }
3592
3593 /*
3594  * Add mi to sp's mntinfo4_list if it isn't already in the list.  Makes
3595  * mi's clientid the same as sp's.
3596  * Assumes sp is locked down.
3597  */
3598 void
3599 nfs4_add_mi_to_server(nfs4_server_t *sp, mntinfo4_t *mi)
3600 {
3601         mntinfo4_t *tmi;
3602         int in_list = 0;
3603
3604         ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
3605             nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
3606         ASSERT(sp != &nfs4_server_lst);
3607         ASSERT(MUTEX_HELD(&sp->s_lock));
3608
3609         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3610             "nfs4_add_mi_to_server: add mi %p to sp %p",
3611             (void*)mi, (void*)sp));
3612
3613         for (tmi = sp->mntinfo4_list;
3614             tmi != NULL;
3615             tmi = tmi->mi_clientid_next) {
3616                 if (tmi == mi) {
3617                         NFS4_DEBUG(nfs4_client_lease_debug,
3618                             (CE_NOTE,
3619                             "nfs4_add_mi_to_server: mi in list"));
3620                         in_list = 1;
3621                 }
3622         }
3623
3624         /*
3625          * First put a hold on the mntinfo4's vfsp so that references via
3626          * mntinfo4_list will be valid.
3627          */
3628         if (!in_list)
3629                 VFS_HOLD(mi->mi_vfsp);
3630
3631         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4_add_mi_to_server: "
3632             "hold vfs %p for mi: %p", (void*)mi->mi_vfsp, (void*)mi));
3633
3634         if (!in_list) {
3635                 if (sp->mntinfo4_list)
3636                         sp->mntinfo4_list->mi_clientid_prev = mi;
3637                 mi->mi_clientid_next = sp->mntinfo4_list;
3638                 mi->mi_srv = sp;
3639                 sp->mntinfo4_list = mi;
3640                 mi->mi_srvsettime = gethrestime_sec();
3641                 mi->mi_srvset_cnt++;
3642         }
3643
3644         /* set mi's clientid to that of sp's for later matching */
3645         mi->mi_clientid = sp->clientid;
3646
3647         /*
3648          * Update the clientid for any other mi's belonging to sp.  This
3649          * must be done here while we hold sp->s_lock, so that
3650          * find_nfs4_server() continues to work.
3651          */
3652
3653         for (tmi = sp->mntinfo4_list;
3654             tmi != NULL;
3655             tmi = tmi->mi_clientid_next) {
3656                 if (tmi != mi) {
3657                         tmi->mi_clientid = sp->clientid;
3658                 }
3659         }
3660 }
3661
3662 /*
3663  * Remove the mi from sp's mntinfo4_list and release its reference.
3664  * Exception: if mi still has open files, flag it for later removal (when
3665  * all the files are closed).
3666  *
3667  * If this is the last mntinfo4 in sp's list then tell the lease renewal
3668  * thread to exit.
3669  */
3670 static void
3671 nfs4_remove_mi_from_server_nolock(mntinfo4_t *mi, nfs4_server_t *sp)
3672 {
3673         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3674             "nfs4_remove_mi_from_server_nolock: remove mi %p from sp %p",
3675             (void*)mi, (void*)sp));
3676
3677         ASSERT(sp != NULL);
3678         ASSERT(MUTEX_HELD(&sp->s_lock));
3679         ASSERT(mi->mi_open_files >= 0);
3680
3681         /*
3682          * First make sure this mntinfo4 can be taken off of the list,
3683          * ie: it doesn't have any open files remaining.
3684          */
3685         if (mi->mi_open_files > 0) {
3686                 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3687                     "nfs4_remove_mi_from_server_nolock: don't "
3688                     "remove mi since it still has files open"));
3689
3690                 mutex_enter(&mi->mi_lock);
3691                 mi->mi_flags |= MI4_REMOVE_ON_LAST_CLOSE;
3692                 mutex_exit(&mi->mi_lock);
3693                 return;
3694         }
3695
3696         VFS_HOLD(mi->mi_vfsp);
3697         remove_mi(sp, mi);
3698         VFS_RELE(mi->mi_vfsp);
3699
3700         if (sp->mntinfo4_list == NULL) {
3701                 /* last fs unmounted, kill the thread */
3702                 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3703                     "remove_mi_from_nfs4_server_nolock: kill the thread"));
3704                 nfs4_mark_srv_dead(sp);
3705         }
3706 }
3707
3708 /*
3709  * Remove mi from sp's mntinfo4_list and release the vfs reference.
3710  */
3711 static void
3712 remove_mi(nfs4_server_t *sp, mntinfo4_t *mi)
3713 {
3714         ASSERT(MUTEX_HELD(&sp->s_lock));
3715
3716         /*
3717          * We release a reference, and the caller must still have a
3718          * reference.
3719          */
3720         ASSERT(mi->mi_vfsp->vfs_count >= 2);
3721
3722         if (mi->mi_clientid_prev) {
3723                 mi->mi_clientid_prev->mi_clientid_next = mi->mi_clientid_next;
3724         } else {
3725                 /* This is the first mi in sp's mntinfo4_list */
3726                 /*
3727                  * Make sure the first mntinfo4 in the list is the actual
3728                  * mntinfo4 passed in.
3729                  */
3730                 ASSERT(sp->mntinfo4_list == mi);
3731
3732                 sp->mntinfo4_list = mi->mi_clientid_next;
3733         }
3734         if (mi->mi_clientid_next)
3735                 mi->mi_clientid_next->mi_clientid_prev = mi->mi_clientid_prev;
3736
3737         /* Now mark the mntinfo4's links as being removed */
3738         mi->mi_clientid_prev = mi->mi_clientid_next = NULL;
3739         mi->mi_srv = NULL;
3740         mi->mi_srvset_cnt++;
3741
3742         VFS_RELE(mi->mi_vfsp);
3743 }
3744
3745 /*
3746  * Free all the entries in sp's mntinfo4_list.
3747  */
3748 static void
3749 remove_all_mi(nfs4_server_t *sp)
3750 {
3751         mntinfo4_t *mi;
3752
3753         ASSERT(MUTEX_HELD(&sp->s_lock));
3754
3755         while (sp->mntinfo4_list != NULL) {
3756                 mi = sp->mntinfo4_list;
3757                 /*
3758                  * Grab a reference in case there is only one left (which
3759                  * remove_mi() frees).
3760                  */
3761                 VFS_HOLD(mi->mi_vfsp);
3762                 remove_mi(sp, mi);
3763                 VFS_RELE(mi->mi_vfsp);
3764         }
3765 }
3766
3767 /*
3768  * Remove the mi from sp's mntinfo4_list as above, and rele the vfs.
3769  *
3770  * This version can be called with a null nfs4_server_t arg,
3771  * and will either find the right one and handle locking, or
3772  * do nothing because the mi wasn't added to an sp's mntinfo4_list.
3773  */
3774 void
3775 nfs4_remove_mi_from_server(mntinfo4_t *mi, nfs4_server_t *esp)
3776 {
3777         nfs4_server_t   *sp;
3778
3779         if (esp) {
3780                 nfs4_remove_mi_from_server_nolock(mi, esp);
3781                 return;
3782         }
3783
3784         (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
3785         if (sp = find_nfs4_server_all(mi, 1)) {
3786                 nfs4_remove_mi_from_server_nolock(mi, sp);
3787                 mutex_exit(&sp->s_lock);
3788                 nfs4_server_rele(sp);
3789         }
3790         nfs_rw_exit(&mi->mi_recovlock);
3791 }
3792
3793 /*
3794  * Return TRUE if the given server has any non-unmounted filesystems.
3795  */
3796
3797 bool_t
3798 nfs4_fs_active(nfs4_server_t *sp)
3799 {
3800         mntinfo4_t *mi;
3801
3802         ASSERT(MUTEX_HELD(&sp->s_lock));
3803
3804         for (mi = sp->mntinfo4_list; mi != NULL; mi = mi->mi_clientid_next) {
3805                 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
3806                         return (TRUE);
3807         }
3808
3809         return (FALSE);
3810 }
3811
3812 /*
3813  * Mark sp as finished and notify any waiters.
3814  */
3815
3816 void
3817 nfs4_mark_srv_dead(nfs4_server_t *sp)
3818 {
3819         ASSERT(MUTEX_HELD(&sp->s_lock));
3820
3821         sp->s_thread_exit = NFS4_THREAD_EXIT;
3822         cv_broadcast(&sp->cv_thread_exit);
3823 }
3824
3825 /*
3826  * Create a new nfs4_server_t structure.
3827  * Returns new node unlocked and not in list, but with a reference count of
3828  * 1.
3829  */
3830 struct nfs4_server *
3831 new_nfs4_server(struct servinfo4 *svp, cred_t *cr)
3832 {
3833         struct nfs4_server *np;
3834         timespec_t tt;
3835         union {
3836                 struct {
3837                         uint32_t sec;
3838                         uint32_t subsec;
3839                 } un_curtime;
3840                 verifier4       un_verifier;
3841         } nfs4clientid_verifier;
3842         /*
3843          * We change this ID string carefully and with the Solaris
3844          * NFS server behaviour in mind.  "+referrals" indicates
3845          * a client that can handle an NFSv4 referral.
3846          */
3847         char id_val[] = "Solaris: %s, NFSv4 kernel client +referrals";
3848         int len;
3849
3850         np = kmem_zalloc(sizeof (struct nfs4_server), KM_SLEEP);
3851         np->saddr.len = svp->sv_addr.len;
3852         np->saddr.maxlen = svp->sv_addr.maxlen;
3853         np->saddr.buf = kmem_alloc(svp->sv_addr.maxlen, KM_SLEEP);
3854         bcopy(svp->sv_addr.buf, np->saddr.buf, svp->sv_addr.len);
3855         np->s_refcnt = 1;
3856
3857         /*
3858          * Build the nfs_client_id4 for this server mount.  Ensure
3859          * the verifier is useful and that the identification is
3860          * somehow based on the server's address for the case of
3861          * multi-homed servers.
3862          */
3863         nfs4clientid_verifier.un_verifier = 0;
3864         gethrestime(&tt);
3865         nfs4clientid_verifier.un_curtime.sec = (uint32_t)tt.tv_sec;
3866         nfs4clientid_verifier.un_curtime.subsec = (uint32_t)tt.tv_nsec;
3867         np->clidtosend.verifier = nfs4clientid_verifier.un_verifier;
3868
3869         /*
3870          * calculate the length of the opaque identifier.  Subtract 2
3871          * for the "%s" and add the traditional +1 for null
3872          * termination.
3873          */
3874         len = strlen(id_val) - 2 + strlen(uts_nodename()) + 1;
3875         np->clidtosend.id_len = len + np->saddr.maxlen;
3876
3877         np->clidtosend.id_val = kmem_alloc(np->clidtosend.id_len, KM_SLEEP);
3878         (void) sprintf(np->clidtosend.id_val, id_val, uts_nodename());
3879         bcopy(np->saddr.buf, &np->clidtosend.id_val[len], np->saddr.len);
3880
3881         np->s_flags = 0;
3882         np->mntinfo4_list = NULL;
3883         /* save cred for issuing rfs4calls inside the renew thread */
3884         crhold(cr);
3885         np->s_cred = cr;
3886         cv_init(&np->cv_thread_exit, NULL, CV_DEFAULT, NULL);
3887         mutex_init(&np->s_lock, NULL, MUTEX_DEFAULT, NULL);
3888         nfs_rw_init(&np->s_recovlock, NULL, RW_DEFAULT, NULL);
3889         list_create(&np->s_deleg_list, sizeof (rnode4_t),
3890             offsetof(rnode4_t, r_deleg_link));
3891         np->s_thread_exit = 0;
3892         np->state_ref_count = 0;
3893         np->lease_valid = NFS4_LEASE_NOT_STARTED;
3894         cv_init(&np->s_cv_otw_count, NULL, CV_DEFAULT, NULL);
3895         cv_init(&np->s_clientid_pend, NULL, CV_DEFAULT, NULL);
3896         np->s_otw_call_count = 0;
3897         cv_init(&np->wait_cb_null, NULL, CV_DEFAULT, NULL);
3898         np->zoneid = getzoneid();
3899         np->zone_globals = nfs4_get_callback_globals();
3900         ASSERT(np->zone_globals != NULL);
3901         return (np);
3902 }
3903
3904 /*
3905  * Create a new nfs4_server_t structure and add it to the list.
3906  * Returns new node locked; reference must eventually be freed.
3907  */
3908 static struct nfs4_server *
3909 add_new_nfs4_server(struct servinfo4 *svp, cred_t *cr)
3910 {
3911         nfs4_server_t *sp;
3912
3913         ASSERT(MUTEX_HELD(&nfs4_server_lst_lock));
3914         sp = new_nfs4_server(svp, cr);
3915         mutex_enter(&sp->s_lock);
3916         insque(sp, &nfs4_server_lst);
3917         sp->s_refcnt++;                 /* list gets a reference */
3918         sp->s_flags |= N4S_INSERTED;
3919         sp->clientid = 0;
3920         return (sp);
3921 }
3922
3923 int nfs4_server_t_debug = 0;
3924
3925
3926 #ifdef DEBUG
3927 void
3928 dumpnfs4slist(char *txt, mntinfo4_t *mi, clientid4 clientid, servinfo4_t *srv_p)
3929 {
3930         int hash16(void *p, int len);
3931         nfs4_server_t *np;
3932
3933         NFS4_DEBUG(nfs4_server_t_debug, (CE_NOTE,
3934             "dumping nfs4_server_t list in %s", txt));
3935         NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
3936             "mi 0x%p, want clientid %llx, addr %d/%04X",
3937             mi, (longlong_t)clientid, srv_p->sv_addr.len,
3938             hash16((void *)srv_p->sv_addr.buf, srv_p->sv_addr.len)));
3939         for (np = nfs4_server_lst.forw; np != &nfs4_server_lst;
3940             np = np->forw) {
3941                 NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
3942                     "node 0x%p,    clientid %llx, addr %d/%04X, cnt %d",
3943                     np, (longlong_t)np->clientid, np->saddr.len,
3944                     hash16((void *)np->saddr.buf, np->saddr.len),
3945                     np->state_ref_count));
3946                 if (np->saddr.len == srv_p->sv_addr.len &&
3947                     bcmp(np->saddr.buf, srv_p->sv_addr.buf,
3948                     np->saddr.len) == 0)
3949                         NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
3950                             " - address matches"));
3951                 if (np->clientid == clientid || np->clientid == 0)
3952                         NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
3953                             " - clientid matches"));
3954                 if (np->s_thread_exit != NFS4_THREAD_EXIT)
3955                         NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
3956                             " - thread not exiting"));
3957         }
3958         ddi_sleep(1);
3959 }
3960 #endif
3961
3962
3963 /*
3964  * Move a mntinfo4_t from one server list to another.
3965  * Locking of the two nfs4_server_t nodes will be done in list order.
3966  *
3967  * Returns NULL if the current nfs4_server_t for the filesystem could not
3968  * be found (e.g., due to forced unmount).  Otherwise returns a reference
3969  * to the new nfs4_server_t, which must eventually be freed.
3970  */
3971 nfs4_server_t *
3972 nfs4_move_mi(mntinfo4_t *mi, servinfo4_t *old, servinfo4_t *new)
3973 {
3974         nfs4_server_t *p, *op = NULL, *np = NULL;
3975         int num_open;
3976         zoneid_t zoneid = nfs_zoneid();
3977
3978         ASSERT(nfs_zone() == mi->mi_zone);
3979
3980         mutex_enter(&nfs4_server_lst_lock);
3981 #ifdef DEBUG
3982         if (nfs4_server_t_debug)
3983                 dumpnfs4slist("nfs4_move_mi", mi, (clientid4)0, new);
3984 #endif
3985         for (p = nfs4_server_lst.forw; p != &nfs4_server_lst; p = p->forw) {
3986                 if (p->zoneid != zoneid)
3987                         continue;
3988                 if (p->saddr.len == old->sv_addr.len &&
3989                     bcmp(p->saddr.buf, old->sv_addr.buf, p->saddr.len) == 0 &&
3990                     p->s_thread_exit != NFS4_THREAD_EXIT) {
3991                         op = p;
3992                         mutex_enter(&op->s_lock);
3993                         op->s_refcnt++;
3994                 }
3995                 if (p->saddr.len == new->sv_addr.len &&
3996                     bcmp(p->saddr.buf, new->sv_addr.buf, p->saddr.len) == 0 &&
3997                     p->s_thread_exit != NFS4_THREAD_EXIT) {
3998                         np = p;
3999                         mutex_enter(&np->s_lock);
4000                 }
4001                 if (op != NULL && np != NULL)
4002                         break;
4003         }
4004         if (op == NULL) {
4005                 /*
4006                  * Filesystem has been forcibly unmounted.  Bail out.
4007                  */
4008                 if (np != NULL)
4009                         mutex_exit(&np->s_lock);
4010                 mutex_exit(&nfs4_server_lst_lock);
4011                 return (NULL);
4012         }
4013         if (np != NULL) {
4014                 np->s_refcnt++;
4015         } else {
4016 #ifdef DEBUG
4017                 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
4018                     "nfs4_move_mi: no target nfs4_server, will create."));
4019 #endif
4020                 np = add_new_nfs4_server(new, kcred);
4021         }
4022         mutex_exit(&nfs4_server_lst_lock);
4023
4024         NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
4025             "nfs4_move_mi: for mi 0x%p, "
4026             "old servinfo4 0x%p, new servinfo4 0x%p, "
4027             "old nfs4_server 0x%p, new nfs4_server 0x%p, ",
4028             (void*)mi, (void*)old, (void*)new,
4029             (void*)op, (void*)np));
4030         ASSERT(op != NULL && np != NULL);
4031
4032         /* discard any delegations */
4033         nfs4_deleg_discard(mi, op);
4034
4035         num_open = mi->mi_open_files;
4036         mi->mi_open_files = 0;
4037         op->state_ref_count -= num_open;
4038         ASSERT(op->state_ref_count >= 0);
4039         np->state_ref_count += num_open;
4040         nfs4_remove_mi_from_server_nolock(mi, op);
4041         mi->mi_open_files = num_open;
4042         NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
4043             "nfs4_move_mi: mi_open_files %d, op->cnt %d, np->cnt %d",
4044             mi->mi_open_files, op->state_ref_count, np->state_ref_count));
4045
4046         nfs4_add_mi_to_server(np, mi);
4047
4048         mutex_exit(&op->s_lock);
4049         mutex_exit(&np->s_lock);
4050         nfs4_server_rele(op);
4051
4052         return (np);
4053 }
4054
4055 /*
4056  * Need to have the nfs4_server_lst_lock.
4057  * Search the nfs4_server list to find a match on this servinfo4
4058  * based on its address.
4059  *
4060  * Returns NULL if no match is found.  Otherwise returns a reference (which
4061  * must eventually be freed) to a locked nfs4_server.
4062  */
4063 nfs4_server_t *
4064 servinfo4_to_nfs4_server(servinfo4_t *srv_p)
4065 {
4066         nfs4_server_t *np;
4067         zoneid_t zoneid = nfs_zoneid();
4068
4069         ASSERT(MUTEX_HELD(&nfs4_server_lst_lock));
4070         for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
4071                 if (np->zoneid == zoneid &&
4072                     np->saddr.len == srv_p->sv_addr.len &&
4073                     bcmp(np->saddr.buf, srv_p->sv_addr.buf,
4074                     np->saddr.len) == 0 &&
4075                     np->s_thread_exit != NFS4_THREAD_EXIT) {
4076                         mutex_enter(&np->s_lock);
4077                         np->s_refcnt++;
4078                         return (np);
4079                 }
4080         }
4081         return (NULL);
4082 }
4083
4084 /*
4085  * Locks the nfs4_server down if it is found and returns a reference that
4086  * must eventually be freed.
4087  */
4088 static nfs4_server_t *
4089 lookup_nfs4_server(nfs4_server_t *sp, int any_state)
4090 {
4091         nfs4_server_t *np;
4092
4093         mutex_enter(&nfs4_server_lst_lock);
4094         for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
4095                 mutex_enter(&np->s_lock);
4096                 if (np == sp && np->s_refcnt > 0 &&
4097                     (np->s_thread_exit != NFS4_THREAD_EXIT || any_state)) {
4098                         mutex_exit(&nfs4_server_lst_lock);
4099                         np->s_refcnt++;
4100                         return (np);
4101                 }
4102                 mutex_exit(&np->s_lock);
4103         }
4104         mutex_exit(&nfs4_server_lst_lock);
4105
4106         return (NULL);
4107 }
4108
4109 /*
4110  * The caller should be holding mi->mi_recovlock, and it should continue to
4111  * hold the lock until done with the returned nfs4_server_t.  Once
4112  * mi->mi_recovlock is released, there is no guarantee that the returned
4113  * mi->nfs4_server_t will continue to correspond to mi.
4114  */
4115 nfs4_server_t *
4116 find_nfs4_server(mntinfo4_t *mi)
4117 {
4118         ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
4119             nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
4120
4121         return (lookup_nfs4_server(mi->mi_srv, 0));
4122 }
4123
4124 /*
4125  * Same as above, but takes an "any_state" parameter which can be
4126  * set to 1 if the caller wishes to find nfs4_server_t's which
4127  * have been marked for termination by the exit of the renew
4128  * thread.  This should only be used by operations which are
4129  * cleaning up and will not cause an OTW op.
4130  */
4131 nfs4_server_t *
4132 find_nfs4_server_all(mntinfo4_t *mi, int any_state)
4133 {
4134         ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
4135             nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
4136
4137         return (lookup_nfs4_server(mi->mi_srv, any_state));
4138 }
4139
4140 /*
4141  * Lock sp, but only if it's still active (in the list and hasn't been
4142  * flagged as exiting) or 'any_state' is non-zero.
4143  * Returns TRUE if sp got locked and adds a reference to sp.
4144  */
4145 bool_t
4146 nfs4_server_vlock(nfs4_server_t *sp, int any_state)
4147 {
4148         return (lookup_nfs4_server(sp, any_state) != NULL);
4149 }
4150
4151 /*
4152  * Release the reference to sp and destroy it if that's the last one.
4153  */
4154
4155 void
4156 nfs4_server_rele(nfs4_server_t *sp)
4157 {
4158         mutex_enter(&sp->s_lock);
4159         ASSERT(sp->s_refcnt > 0);
4160         sp->s_refcnt--;
4161         if (sp->s_refcnt > 0) {
4162                 mutex_exit(&sp->s_lock);
4163                 return;
4164         }
4165         mutex_exit(&sp->s_lock);
4166
4167         mutex_enter(&nfs4_server_lst_lock);
4168         mutex_enter(&sp->s_lock);
4169         if (sp->s_refcnt > 0) {
4170                 mutex_exit(&sp->s_lock);
4171                 mutex_exit(&nfs4_server_lst_lock);
4172                 return;
4173         }
4174         remque(sp);
4175         sp->forw = sp->back = NULL;
4176         mutex_exit(&nfs4_server_lst_lock);
4177         destroy_nfs4_server(sp);
4178 }
4179
4180 static void
4181 destroy_nfs4_server(nfs4_server_t *sp)
4182 {
4183         ASSERT(MUTEX_HELD(&sp->s_lock));
4184         ASSERT(sp->s_refcnt == 0);
4185         ASSERT(sp->s_otw_call_count == 0);
4186
4187         remove_all_mi(sp);
4188
4189         crfree(sp->s_cred);
4190         kmem_free(sp->saddr.buf, sp->saddr.maxlen);
4191         kmem_free(sp->clidtosend.id_val, sp->clidtosend.id_len);
4192         mutex_exit(&sp->s_lock);
4193
4194         /* destroy the nfs4_server */
4195         nfs4callback_destroy(sp);
4196         list_destroy(&sp->s_deleg_list);
4197         mutex_destroy(&sp->s_lock);
4198         cv_destroy(&sp->cv_thread_exit);
4199         cv_destroy(&sp->s_cv_otw_count);
4200         cv_destroy(&sp->s_clientid_pend);
4201         cv_destroy(&sp->wait_cb_null);
4202         nfs_rw_destroy(&sp->s_recovlock);
4203         kmem_free(sp, sizeof (*sp));
4204 }
4205
4206 /*
4207  * Fork off a thread to free the data structures for a mount.
4208  */
4209
4210 static void
4211 async_free_mount(vfs_t *vfsp, int flag, cred_t *cr)
4212 {
4213         freemountargs_t *args;
4214         args = kmem_alloc(sizeof (freemountargs_t), KM_SLEEP);
4215         args->fm_vfsp = vfsp;
4216         VFS_HOLD(vfsp);
4217         MI4_HOLD(VFTOMI4(vfsp));
4218         args->fm_flag = flag;
4219         args->fm_cr = cr;
4220         crhold(cr);
4221         (void) zthread_create(NULL, 0, nfs4_free_mount_thread, args, 0,
4222             minclsyspri);
4223 }
4224
4225 static void
4226 nfs4_free_mount_thread(freemountargs_t *args)
4227 {
4228         mntinfo4_t *mi;
4229         nfs4_free_mount(args->fm_vfsp, args->fm_flag, args->fm_cr);
4230         mi = VFTOMI4(args->fm_vfsp);
4231         crfree(args->fm_cr);
4232         VFS_RELE(args->fm_vfsp);
4233         MI4_RELE(mi);
4234         kmem_free(args, sizeof (freemountargs_t));
4235         zthread_exit();
4236         /* NOTREACHED */
4237 }
4238
4239 /*
4240  * Thread to free the data structures for a given filesystem.
4241  */
4242 static void
4243 nfs4_free_mount(vfs_t *vfsp, int flag, cred_t *cr)
4244 {
4245         mntinfo4_t              *mi = VFTOMI4(vfsp);
4246         nfs4_server_t           *sp;
4247         callb_cpr_t             cpr_info;
4248         kmutex_t                cpr_lock;
4249         boolean_t               async_thread;
4250         int                     removed;
4251
4252         bool_t                  must_unlock;
4253         nfs4_ephemeral_tree_t   *eph_tree;
4254
4255         /*
4256          * We need to participate in the CPR framework if this is a kernel
4257          * thread.
4258          */
4259         async_thread = (curproc == nfs_zone()->zone_zsched);
4260         if (async_thread) {
4261                 mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
4262                 CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr,
4263                     "nfsv4AsyncUnmount");
4264         }
4265
4266         /*
4267          * We need to wait for all outstanding OTW calls
4268          * and recovery to finish before we remove the mi
4269          * from the nfs4_server_t, as current pending
4270          * calls might still need this linkage (in order
4271          * to find a nfs4_server_t from a mntinfo4_t).
4272          */
4273         (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, FALSE);
4274         sp = find_nfs4_server(mi);
4275         nfs_rw_exit(&mi->mi_recovlock);
4276
4277         if (sp) {
4278                 while (sp->s_otw_call_count != 0) {
4279                         if (async_thread) {
4280                                 mutex_enter(&cpr_lock);
4281                                 CALLB_CPR_SAFE_BEGIN(&cpr_info);
4282                                 mutex_exit(&cpr_lock);
4283                         }
4284                         cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
4285                         if (async_thread) {
4286                                 mutex_enter(&cpr_lock);
4287                                 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
4288                                 mutex_exit(&cpr_lock);
4289                         }
4290                 }
4291                 mutex_exit(&sp->s_lock);
4292                 nfs4_server_rele(sp);
4293                 sp = NULL;
4294         }
4295
4296         mutex_enter(&mi->mi_lock);
4297         while (mi->mi_in_recovery != 0) {
4298                 if (async_thread) {
4299                         mutex_enter(&cpr_lock);
4300                         CALLB_CPR_SAFE_BEGIN(&cpr_info);
4301                         mutex_exit(&cpr_lock);
4302                 }
4303                 cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
4304                 if (async_thread) {
4305                         mutex_enter(&cpr_lock);
4306                         CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
4307                         mutex_exit(&cpr_lock);
4308                 }
4309         }
4310         mutex_exit(&mi->mi_lock);
4311
4312         /*
4313          * If we got an error, then do not nuke the
4314          * tree. Either the harvester is busy reclaiming
4315          * this node or we ran into some busy condition.
4316          *
4317          * The harvester will eventually come along and cleanup.
4318          * The only problem would be the root mount point.
4319          *
4320          * Since the busy node can occur for a variety
4321          * of reasons and can result in an entry staying
4322          * in df output but no longer accessible from the
4323          * directory tree, we are okay.
4324          */
4325         if (!nfs4_ephemeral_umount(mi, flag, cr,
4326             &must_unlock, &eph_tree))
4327                 nfs4_ephemeral_umount_activate(mi, &must_unlock,
4328                     &eph_tree);
4329
4330         /*
4331          * The original purge of the dnlc via 'dounmount'
4332          * doesn't guarantee that another dnlc entry was not
4333          * added while we waitied for all outstanding OTW
4334          * and recovery calls to finish.  So re-purge the
4335          * dnlc now.
4336          */
4337         (void) dnlc_purge_vfsp(vfsp, 0);
4338
4339         /*
4340          * We need to explicitly stop the manager thread; the asyc worker
4341          * threads can timeout and exit on their own.
4342          */
4343         mutex_enter(&mi->mi_async_lock);
4344         mi->mi_max_threads = 0;
4345         NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
4346         mutex_exit(&mi->mi_async_lock);
4347         if (mi->mi_manager_thread)
4348                 nfs4_async_manager_stop(vfsp);
4349
4350         destroy_rtable4(vfsp, cr);
4351
4352         nfs4_remove_mi_from_server(mi, NULL);
4353
4354         if (async_thread) {
4355                 mutex_enter(&cpr_lock);
4356                 CALLB_CPR_EXIT(&cpr_info);      /* drops cpr_lock */
4357                 mutex_destroy(&cpr_lock);
4358         }
4359
4360         removed = nfs4_mi_zonelist_remove(mi);
4361         if (removed)
4362                 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
4363 }
4364
4365 /* Referral related sub-routines */
4366
4367 /* Freeup knetconfig */
4368 static void
4369 free_knconf_contents(struct knetconfig *k)
4370 {
4371         if (k == NULL)
4372                 return;
4373         if (k->knc_protofmly)
4374                 kmem_free(k->knc_protofmly, KNC_STRSIZE);
4375         if (k->knc_proto)
4376                 kmem_free(k->knc_proto, KNC_STRSIZE);
4377 }
4378
4379 /*
4380  * This updates newpath variable with exact name component from the
4381  * path which gave us a NFS4ERR_MOVED error.
4382  * If the path is /rp/aaa/bbb and nth value is 1, aaa is returned.
4383  */
4384 static char *
4385 extract_referral_point(const char *svp, int nth)
4386 {
4387         int num_slashes = 0;
4388         const char *p;
4389         char *newpath = NULL;
4390         int i = 0;
4391
4392         newpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
4393         for (p = svp; *p; p++) {
4394                 if (*p == '/')
4395                         num_slashes++;
4396                 if (num_slashes == nth + 1) {
4397                         p++;
4398                         while (*p != '/') {
4399                                 if (*p == '\0')
4400                                         break;
4401                                 newpath[i] = *p;
4402                                 i++;
4403                                 p++;
4404                         }
4405                         newpath[i++] = '\0';
4406                         break;
4407                 }
4408         }
4409         return (newpath);
4410 }
4411
4412 /*
4413  * This sets up a new path in sv_path to do a lookup of the referral point.
4414  * If the path is /rp/aaa/bbb and the referral point is aaa,
4415  * this updates /rp/aaa. This path will be used to get referral
4416  * location.
4417  */
4418 static void
4419 setup_newsvpath(servinfo4_t *svp, int nth)
4420 {
4421         int num_slashes = 0, pathlen, i = 0;
4422         char *newpath, *p;
4423
4424         newpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
4425         for (p = svp->sv_path; *p; p++) {
4426                 newpath[i] =  *p;
4427                 if (*p == '/')
4428                         num_slashes++;
4429                 if (num_slashes == nth + 1) {
4430                         newpath[i] = '\0';
4431                         pathlen = strlen(newpath) + 1;
4432                         kmem_free(svp->sv_path, svp->sv_pathlen);
4433                         svp->sv_path = kmem_alloc(pathlen, KM_SLEEP);
4434                         svp->sv_pathlen = pathlen;
4435                         bcopy(newpath, svp->sv_path, pathlen);
4436                         break;
4437                 }
4438                 i++;
4439         }
4440         kmem_free(newpath, MAXPATHLEN);
4441 }