kernel/fs/nfs/nfs4_vfsops.c
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
29 * All Rights Reserved
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/vfs.h>
37 #include <sys/vfs_opreg.h>
38 #include <sys/vnode.h>
39 #include <sys/pathname.h>
40 #include <sys/sysmacros.h>
41 #include <sys/kmem.h>
42 #include <sys/mkdev.h>
43 #include <sys/mount.h>
44 #include <sys/statvfs.h>
45 #include <sys/errno.h>
46 #include <sys/debug.h>
47 #include <sys/cmn_err.h>
48 #include <sys/utsname.h>
49 #include <sys/bootconf.h>
50 #include <sys/modctl.h>
51 #include <sys/acl.h>
52 #include <sys/flock.h>
53 #include <sys/time.h>
54 #include <sys/disp.h>
55 #include <sys/policy.h>
56 #include <sys/socket.h>
57 #include <sys/netconfig.h>
58 #include <sys/dnlc.h>
59 #include <sys/list.h>
60 #include <sys/mntent.h>
62 #include <rpc/types.h>
63 #include <rpc/auth.h>
64 #include <rpc/rpcsec_gss.h>
65 #include <rpc/clnt.h>
67 #include <nfs/nfs.h>
68 #include <nfs/nfs_clnt.h>
69 #include <nfs/mount.h>
70 #include <nfs/nfs_acl.h>
72 #include <sys/fs_subr.h>
74 #include <nfs/nfs4.h>
75 #include <nfs/rnode4.h>
76 #include <nfs/nfs4_clnt.h>
77 #include <sys/fs/autofs.h>
79 #include <sys/sdt.h>
83 * Arguments passed to the thread that frees data structures after a forced unmount.
86 typedef struct {
87 vfs_t *fm_vfsp;
88 int fm_flag;
89 cred_t *fm_cr;
90 } freemountargs_t;
92 static void async_free_mount(vfs_t *, int, cred_t *);
93 static void nfs4_free_mount(vfs_t *, int, cred_t *);
94 static void nfs4_free_mount_thread(freemountargs_t *);
95 static int nfs4_chkdup_servinfo4(servinfo4_t *, servinfo4_t *);
98 * From rpcsec module (common/rpcsec).
100 extern int sec_clnt_loadinfo(struct sec_data *, struct sec_data **, model_t);
101 extern void sec_clnt_freeinfo(struct sec_data *);
104 * The order and contents of this structure must be kept in sync with that of
105 * rfsreqcnt_v4_tmpl in nfs_stats.c
107 static char *rfsnames_v4[] = {
108 "null", "compound", "reserved", "access", "close", "commit", "create",
109 "delegpurge", "delegreturn", "getattr", "getfh", "link", "lock",
110 "lockt", "locku", "lookup", "lookupp", "nverify", "open", "openattr",
111 "open_confirm", "open_downgrade", "putfh", "putpubfh", "putrootfh",
112 "read", "readdir", "readlink", "remove", "rename", "renew",
113 "restorefh", "savefh", "secinfo", "setattr", "setclientid",
114 "setclientid_confirm", "verify", "write"
118 * nfs4_max_mount_retry is the number of times the client will redrive
119 * a mount compound before giving up and returning failure. The intent
120 * is to redrive mount compounds which fail NFS4ERR_STALE so that
121 * if a component of the server path being mounted goes stale, it can
122 * "recover" by redriving the mount compund (LOOKUP ops). This recovery
123 * code is needed outside of the recovery framework because mount is a
124 * special case. The client doesn't create vnodes/rnodes for components
125 * of the server path being mounted. The recovery code recovers real
126 * client objects, not STALE FHs which map to components of the server
127 * path being mounted.
129 * We could just fail the mount on the first time, but that would
130 * instantly trigger failover (from nfs4_mount), and the client should
131 * try to re-lookup the STALE FH before doing failover. The easiest
132 * way to "re-lookup" is to simply redrive the mount compound.
134 static int nfs4_max_mount_retry = 2;
137 * nfs4 vfs operations.
139 int nfs4_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *);
140 static int nfs4_unmount(vfs_t *, int, cred_t *);
141 static int nfs4_root(vfs_t *, vnode_t **);
142 static int nfs4_statvfs(vfs_t *, struct statvfs64 *);
143 static int nfs4_sync(vfs_t *, short, cred_t *);
144 static int nfs4_vget(vfs_t *, vnode_t **, fid_t *);
145 static int nfs4_mountroot(vfs_t *, whymountroot_t);
146 static void nfs4_freevfs(vfs_t *);
148 static int nfs4rootvp(vnode_t **, vfs_t *, struct servinfo4 *,
149 int, cred_t *, zone_t *);
151 vfsops_t *nfs4_vfsops;
153 int nfs4_vfsinit(void);
154 void nfs4_vfsfini(void);
155 static void nfs4setclientid_init(void);
156 static void nfs4setclientid_fini(void);
157 static void nfs4setclientid_otw(mntinfo4_t *, servinfo4_t *, cred_t *,
158 struct nfs4_server *, nfs4_error_t *, int *);
159 static void destroy_nfs4_server(nfs4_server_t *);
160 static void remove_mi(nfs4_server_t *, mntinfo4_t *);
162 extern void nfs4_ephemeral_init(void);
163 extern void nfs4_ephemeral_fini(void);
165 /* referral related routines */
166 static servinfo4_t *copy_svp(servinfo4_t *);
167 static void free_knconf_contents(struct knetconfig *k);
168 static char *extract_referral_point(const char *, int);
169 static void setup_newsvpath(servinfo4_t *, int);
170 static void update_servinfo4(servinfo4_t *, fs_location4 *,
171 struct nfs_fsl_info *, char *, int);
174 * Initialize the vfs structure
177 static int nfs4fstyp;
181 * Debug variable to check for rdma based
182 * transport startup and cleanup. Controlled
183 * through /etc/system. Off by default.
185 extern int rdma_debug;
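/*
 * nfs4init() is called during NFS module initialization to register the
 * NFSv4 vfs operations as well as the regular and mirror-mount trigger
 * vnode operations; on any failure the partially registered ops tables
 * are torn down again before the error is returned.
 */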
188 nfs4init(int fstyp, char *name)
190 static const fs_operation_def_t nfs4_vfsops_template[] = {
191 VFSNAME_MOUNT, { .vfs_mount = nfs4_mount },
192 VFSNAME_UNMOUNT, { .vfs_unmount = nfs4_unmount },
193 VFSNAME_ROOT, { .vfs_root = nfs4_root },
194 VFSNAME_STATVFS, { .vfs_statvfs = nfs4_statvfs },
195 VFSNAME_SYNC, { .vfs_sync = nfs4_sync },
196 VFSNAME_VGET, { .vfs_vget = nfs4_vget },
197 VFSNAME_MOUNTROOT, { .vfs_mountroot = nfs4_mountroot },
198 VFSNAME_FREEVFS, { .vfs_freevfs = nfs4_freevfs },
199 NULL, NULL
201 int error;
203 nfs4_vfsops = NULL;
204 nfs4_vnodeops = NULL;
205 nfs4_trigger_vnodeops = NULL;
207 error = vfs_setfsops(fstyp, nfs4_vfsops_template, &nfs4_vfsops);
208 if (error != 0) {
209 zcmn_err(GLOBAL_ZONEID, CE_WARN,
210 "nfs4init: bad vfs ops template");
211 goto out;
214 error = vn_make_ops(name, nfs4_vnodeops_template, &nfs4_vnodeops);
215 if (error != 0) {
216 zcmn_err(GLOBAL_ZONEID, CE_WARN,
217 "nfs4init: bad vnode ops template");
218 goto out;
221 error = vn_make_ops("nfs4_trigger", nfs4_trigger_vnodeops_template,
222 &nfs4_trigger_vnodeops);
223 if (error != 0) {
224 zcmn_err(GLOBAL_ZONEID, CE_WARN,
225 "nfs4init: bad trigger vnode ops template");
226 goto out;
229 nfs4fstyp = fstyp;
230 (void) nfs4_vfsinit();
231 (void) nfs4_init_dot_entries();
233 out:
234 if (error) {
235 if (nfs4_trigger_vnodeops != NULL)
236 vn_freevnodeops(nfs4_trigger_vnodeops);
238 if (nfs4_vnodeops != NULL)
239 vn_freevnodeops(nfs4_vnodeops);
241 (void) vfs_freevfsops_by_type(fstyp);
244 return (error);
247 void
248 nfs4fini(void)
250 (void) nfs4_destroy_dot_entries();
251 nfs4_vfsfini();
255 * Create a new sec_data structure to store AUTH_DH related data:
256 * netname, syncaddr, knetconfig. There is no AUTH_F_RPCTIMESYNC
257 * flag set for NFS V4 since we avoid contacting the rpcbind
258 * daemon and instead use the IP time service (IPPORT_TIMESERVER).
260 * sec_data can be freed by sec_clnt_freeinfo().
262 static struct sec_data *
263 create_authdh_data(char *netname, int nlen, struct netbuf *syncaddr,
264 struct knetconfig *knconf) {
265 struct sec_data *secdata;
266 dh_k4_clntdata_t *data;
267 char *pf, *p;
269 if (syncaddr == NULL || syncaddr->buf == NULL || nlen == 0)
270 return (NULL);
272 secdata = kmem_alloc(sizeof (*secdata), KM_SLEEP);
273 secdata->flags = 0;
275 data = kmem_alloc(sizeof (*data), KM_SLEEP);
277 data->syncaddr.maxlen = syncaddr->maxlen;
278 data->syncaddr.len = syncaddr->len;
279 data->syncaddr.buf = kmem_alloc(syncaddr->len, KM_SLEEP);
280 bcopy(syncaddr->buf, data->syncaddr.buf, syncaddr->len);
283 * duplicate the knconf information for the
284 * new opaque data.
286 data->knconf = kmem_alloc(sizeof (*knconf), KM_SLEEP);
287 *data->knconf = *knconf;
288 pf = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
289 p = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
290 bcopy(knconf->knc_protofmly, pf, KNC_STRSIZE);
291 bcopy(knconf->knc_proto, p, KNC_STRSIZE);
292 data->knconf->knc_protofmly = pf;
293 data->knconf->knc_proto = p;
295 /* move server netname to the sec_data structure */
296 data->netname = kmem_alloc(nlen, KM_SLEEP);
297 bcopy(netname, data->netname, nlen);
298 data->netnamelen = (int)nlen;
300 secdata->secmod = AUTH_DH;
301 secdata->rpcflavor = AUTH_DH;
302 secdata->data = (caddr_t)data;
304 return (secdata);
308 * Returns (deep) copy of sec_data_t. Allocates all memory required; caller
309 * is responsible for freeing.
311 sec_data_t *
312 copy_sec_data(sec_data_t *fsecdata) {
313 sec_data_t *tsecdata;
315 if (fsecdata == NULL)
316 return (NULL);
318 if (fsecdata->rpcflavor == AUTH_DH) {
319 dh_k4_clntdata_t *fdata = (dh_k4_clntdata_t *)fsecdata->data;
321 if (fdata == NULL)
322 return (NULL);
324 tsecdata = (sec_data_t *)create_authdh_data(fdata->netname,
325 fdata->netnamelen, &fdata->syncaddr, fdata->knconf);
327 return (tsecdata);
330 tsecdata = kmem_zalloc(sizeof (sec_data_t), KM_SLEEP);
332 tsecdata->secmod = fsecdata->secmod;
333 tsecdata->rpcflavor = fsecdata->rpcflavor;
334 tsecdata->flags = fsecdata->flags;
335 tsecdata->uid = fsecdata->uid;
337 if (fsecdata->rpcflavor == RPCSEC_GSS) {
338 gss_clntdata_t *gcd = (gss_clntdata_t *)fsecdata->data;
340 tsecdata->data = (caddr_t)copy_sec_data_gss(gcd);
341 } else {
342 tsecdata->data = NULL;
345 return (tsecdata);
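/*
 * Deep-copy the RPCSEC_GSS client data: the mechanism OID, service,
 * principal triple (uname/inst/realm) and QOP. The caller is
 * responsible for freeing the returned structure.
 */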
348 gss_clntdata_t *
349 copy_sec_data_gss(gss_clntdata_t *fdata)
351 gss_clntdata_t *tdata;
353 if (fdata == NULL)
354 return (NULL);
356 tdata = kmem_zalloc(sizeof (gss_clntdata_t), KM_SLEEP);
358 tdata->mechanism.length = fdata->mechanism.length;
359 tdata->mechanism.elements = kmem_zalloc(fdata->mechanism.length,
360 KM_SLEEP);
361 bcopy(fdata->mechanism.elements, tdata->mechanism.elements,
362 fdata->mechanism.length);
364 tdata->service = fdata->service;
366 (void) strcpy(tdata->uname, fdata->uname);
367 (void) strcpy(tdata->inst, fdata->inst);
368 (void) strcpy(tdata->realm, fdata->realm);
370 tdata->qop = fdata->qop;
372 return (tdata);
375 static int
376 nfs4_chkdup_servinfo4(servinfo4_t *svp_head, servinfo4_t *svp)
378 servinfo4_t *si;
381 * Iterate over the servinfo4 list to make sure
382 * we do not have a duplicate. Skip any servinfo4
383 * that has been marked "NOT IN USE"
385 for (si = svp_head; si; si = si->sv_next) {
386 (void) nfs_rw_enter_sig(&si->sv_lock, RW_READER, 0);
387 if (si->sv_flags & SV4_NOTINUSE) {
388 nfs_rw_exit(&si->sv_lock);
389 continue;
391 nfs_rw_exit(&si->sv_lock);
392 if (si == svp)
393 continue;
394 if (si->sv_addr.len == svp->sv_addr.len &&
395 strcmp(si->sv_knconf->knc_protofmly,
396 svp->sv_knconf->knc_protofmly) == 0 &&
397 bcmp(si->sv_addr.buf, svp->sv_addr.buf,
398 si->sv_addr.len) == 0) {
399 /* it's a duplicate */
400 return (1);
403 /* it's not a duplicate */
404 return (0);
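/*
 * Free every dynamically allocated member of an nfs_args structure
 * (knconf, fh, hostname, addr, syncaddr, netname and the security
 * extension data), NULLing each pointer as it is released.
 */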
407 void
408 nfs4_free_args(struct nfs_args *nargs)
410 if (nargs->knconf) {
411 if (nargs->knconf->knc_protofmly)
412 kmem_free(nargs->knconf->knc_protofmly,
413 KNC_STRSIZE);
414 if (nargs->knconf->knc_proto)
415 kmem_free(nargs->knconf->knc_proto, KNC_STRSIZE);
416 kmem_free(nargs->knconf, sizeof (*nargs->knconf));
417 nargs->knconf = NULL;
420 if (nargs->fh) {
421 kmem_free(nargs->fh, strlen(nargs->fh) + 1);
422 nargs->fh = NULL;
425 if (nargs->hostname) {
426 kmem_free(nargs->hostname, strlen(nargs->hostname) + 1);
427 nargs->hostname = NULL;
430 if (nargs->addr) {
431 if (nargs->addr->buf) {
432 ASSERT(nargs->addr->len);
433 kmem_free(nargs->addr->buf, nargs->addr->len);
435 kmem_free(nargs->addr, sizeof (struct netbuf));
436 nargs->addr = NULL;
439 if (nargs->syncaddr) {
440 ASSERT(nargs->syncaddr->len);
441 if (nargs->syncaddr->buf) {
442 ASSERT(nargs->syncaddr->len);
443 kmem_free(nargs->syncaddr->buf, nargs->syncaddr->len);
445 kmem_free(nargs->syncaddr, sizeof (struct netbuf));
446 nargs->syncaddr = NULL;
449 if (nargs->netname) {
450 kmem_free(nargs->netname, strlen(nargs->netname) + 1);
451 nargs->netname = NULL;
454 if (nargs->nfs_ext_u.nfs_extA.secdata) {
455 sec_clnt_freeinfo(
456 nargs->nfs_ext_u.nfs_extA.secdata);
457 nargs->nfs_ext_u.nfs_extA.secdata = NULL;
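/*
 * Copy mount arguments in from user space, turning the embedded
 * pointers (knetconfig, server address, root fh path, hostname,
 * syncaddr/netname and security data) into kernel-allocated copies.
 * On error, everything allocated so far is freed via nfs4_free_args().
 */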
463 nfs4_copyin(char *data, int datalen, struct nfs_args *nargs)
466 int error;
467 size_t hlen; /* length of hostname */
468 size_t nlen; /* length of netname */
469 char netname[MAXNETNAMELEN+1]; /* server's netname */
470 struct netbuf addr; /* server's address */
471 struct netbuf syncaddr; /* AUTH_DES time sync addr */
472 struct knetconfig *knconf; /* transport structure */
473 struct sec_data *secdata = NULL; /* security data */
474 STRUCT_DECL(nfs_args, args); /* nfs mount arguments */
475 STRUCT_DECL(knetconfig, knconf_tmp);
476 STRUCT_DECL(netbuf, addr_tmp);
477 int flags;
478 char *p, *pf;
479 struct pathname pn;
480 char *userbufptr;
483 bzero(nargs, sizeof (*nargs));
485 STRUCT_INIT(args, get_udatamodel());
486 bzero(STRUCT_BUF(args), SIZEOF_STRUCT(nfs_args, DATAMODEL_NATIVE));
487 if (copyin(data, STRUCT_BUF(args), MIN(datalen,
488 STRUCT_SIZE(args))))
489 return (EFAULT);
491 nargs->wsize = STRUCT_FGET(args, wsize);
492 nargs->rsize = STRUCT_FGET(args, rsize);
493 nargs->timeo = STRUCT_FGET(args, timeo);
494 nargs->retrans = STRUCT_FGET(args, retrans);
495 nargs->acregmin = STRUCT_FGET(args, acregmin);
496 nargs->acregmax = STRUCT_FGET(args, acregmax);
497 nargs->acdirmin = STRUCT_FGET(args, acdirmin);
498 nargs->acdirmax = STRUCT_FGET(args, acdirmax);
500 flags = STRUCT_FGET(args, flags);
501 nargs->flags = flags;
503 addr.buf = NULL;
504 syncaddr.buf = NULL;
508 * Allocate space for a knetconfig structure and
509 * its strings and copy in from user-land.
511 knconf = kmem_zalloc(sizeof (*knconf), KM_SLEEP);
512 STRUCT_INIT(knconf_tmp, get_udatamodel());
513 if (copyin(STRUCT_FGETP(args, knconf), STRUCT_BUF(knconf_tmp),
514 STRUCT_SIZE(knconf_tmp))) {
515 kmem_free(knconf, sizeof (*knconf));
516 return (EFAULT);
519 knconf->knc_semantics = STRUCT_FGET(knconf_tmp, knc_semantics);
520 knconf->knc_protofmly = STRUCT_FGETP(knconf_tmp, knc_protofmly);
521 knconf->knc_proto = STRUCT_FGETP(knconf_tmp, knc_proto);
522 if (get_udatamodel() != DATAMODEL_LP64) {
523 knconf->knc_rdev = expldev(STRUCT_FGET(knconf_tmp, knc_rdev));
524 } else {
525 knconf->knc_rdev = STRUCT_FGET(knconf_tmp, knc_rdev);
528 pf = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
529 p = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
530 error = copyinstr(knconf->knc_protofmly, pf, KNC_STRSIZE, NULL);
531 if (error) {
532 kmem_free(pf, KNC_STRSIZE);
533 kmem_free(p, KNC_STRSIZE);
534 kmem_free(knconf, sizeof (*knconf));
535 return (error);
538 error = copyinstr(knconf->knc_proto, p, KNC_STRSIZE, NULL);
539 if (error) {
540 kmem_free(pf, KNC_STRSIZE);
541 kmem_free(p, KNC_STRSIZE);
542 kmem_free(knconf, sizeof (*knconf));
543 return (error);
547 knconf->knc_protofmly = pf;
548 knconf->knc_proto = p;
550 nargs->knconf = knconf;
553 * Get server address
555 STRUCT_INIT(addr_tmp, get_udatamodel());
556 if (copyin(STRUCT_FGETP(args, addr), STRUCT_BUF(addr_tmp),
557 STRUCT_SIZE(addr_tmp))) {
558 error = EFAULT;
559 goto errout;
562 nargs->addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
563 userbufptr = STRUCT_FGETP(addr_tmp, buf);
564 addr.len = STRUCT_FGET(addr_tmp, len);
565 addr.buf = kmem_alloc(addr.len, KM_SLEEP);
566 addr.maxlen = addr.len;
567 if (copyin(userbufptr, addr.buf, addr.len)) {
568 kmem_free(addr.buf, addr.len);
569 error = EFAULT;
570 goto errout;
572 bcopy(&addr, nargs->addr, sizeof (struct netbuf));
575 * Get the root fhandle
577 error = pn_get(STRUCT_FGETP(args, fh), UIO_USERSPACE, &pn);
578 if (error)
579 goto errout;
581 /* Volatile fh: keep server paths, so use actual-size strings */
582 nargs->fh = kmem_alloc(pn.pn_pathlen + 1, KM_SLEEP);
583 bcopy(pn.pn_path, nargs->fh, pn.pn_pathlen);
584 nargs->fh[pn.pn_pathlen] = '\0';
585 pn_free(&pn);
589 * Get server's hostname
591 if (flags & NFSMNT_HOSTNAME) {
592 error = copyinstr(STRUCT_FGETP(args, hostname),
593 netname, sizeof (netname), &hlen);
594 if (error)
595 goto errout;
596 nargs->hostname = kmem_zalloc(hlen, KM_SLEEP);
597 (void) strcpy(nargs->hostname, netname);
599 } else {
600 nargs->hostname = NULL;
605 * If there are syncaddr and netname data, load them in. This is
606 * to support data needed for NFSV4 when AUTH_DH is the negotiated
607 * flavor via SECINFO (instead of using the MOUNT protocol as in V3).
609 netname[0] = '\0';
610 if (flags & NFSMNT_SECURE) {
612 /* get syncaddr */
613 STRUCT_INIT(addr_tmp, get_udatamodel());
614 if (copyin(STRUCT_FGETP(args, syncaddr), STRUCT_BUF(addr_tmp),
615 STRUCT_SIZE(addr_tmp))) {
616 error = EINVAL;
617 goto errout;
619 userbufptr = STRUCT_FGETP(addr_tmp, buf);
620 syncaddr.len = STRUCT_FGET(addr_tmp, len);
621 syncaddr.buf = kmem_alloc(syncaddr.len, KM_SLEEP);
622 syncaddr.maxlen = syncaddr.len;
623 if (copyin(userbufptr, syncaddr.buf, syncaddr.len)) {
624 kmem_free(syncaddr.buf, syncaddr.len);
625 error = EFAULT;
626 goto errout;
629 nargs->syncaddr = kmem_alloc(sizeof (struct netbuf), KM_SLEEP);
630 bcopy(&syncaddr, nargs->syncaddr, sizeof (struct netbuf));
632 /* get server's netname */
633 if (copyinstr(STRUCT_FGETP(args, netname), netname,
634 sizeof (netname), &nlen)) {
635 error = EFAULT;
636 goto errout;
639 netname[nlen] = '\0';
640 nargs->netname = kmem_zalloc(nlen, KM_SLEEP);
641 (void) strcpy(nargs->netname, netname);
645 * Get the extension data which has the security data structure.
646 * This includes data for AUTH_SYS as well.
648 if (flags & NFSMNT_NEWARGS) {
649 nargs->nfs_args_ext = STRUCT_FGET(args, nfs_args_ext);
650 if (nargs->nfs_args_ext == NFS_ARGS_EXTA ||
651 nargs->nfs_args_ext == NFS_ARGS_EXTB) {
653 * Indicating the application is using the new
654 * sec_data structure to pass in the security
655 * data.
657 if (STRUCT_FGETP(args,
658 nfs_ext_u.nfs_extA.secdata) != NULL) {
659 error = sec_clnt_loadinfo(
660 (struct sec_data *)STRUCT_FGETP(args,
661 nfs_ext_u.nfs_extA.secdata),
662 &secdata, get_udatamodel());
664 nargs->nfs_ext_u.nfs_extA.secdata = secdata;
668 if (error)
669 goto errout;
672 * Failover support:
674 * We may have a linked list of nfs_args structures,
675 * which means the user is looking for failover. If
676 * the mount is not "read-only", or is "soft",
677 * we want to bail out with EINVAL.
679 if (nargs->nfs_args_ext == NFS_ARGS_EXTB)
680 nargs->nfs_ext_u.nfs_extB.next =
681 STRUCT_FGETP(args, nfs_ext_u.nfs_extB.next);
683 errout:
684 if (error)
685 nfs4_free_args(nargs);
687 return (error);
692 * nfs mount vfsop
693 * Set up mount info record and attach it to vfs struct.
696 nfs4_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
698 char *data = uap->dataptr;
699 int error;
700 vnode_t *rtvp; /* the server's root */
701 mntinfo4_t *mi; /* mount info, pointed at by vfs */
702 struct knetconfig *rdma_knconf; /* rdma transport structure */
703 rnode4_t *rp;
704 struct servinfo4 *svp; /* nfs server info */
705 struct servinfo4 *svp_tail = NULL; /* previous nfs server info */
706 struct servinfo4 *svp_head; /* first nfs server info */
707 struct servinfo4 *svp_2ndlast; /* 2nd last in server info list */
708 struct sec_data *secdata; /* security data */
709 struct nfs_args *args = NULL;
710 int flags, addr_type, removed;
711 zone_t *zone = nfs_zone();
712 nfs4_error_t n4e;
713 zone_t *mntzone = NULL;
715 if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
716 return (EPERM);
717 if (mvp->v_type != VDIR)
718 return (ENOTDIR);
721 * get arguments
723 * nfs_args is now versioned and is extensible, so
724 * uap->datalen might be different from sizeof (args)
725 * in a compatible situation.
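 *
 * The "more:" label below is re-entered once for each entry in a
 * linked list of nfs_args structures (the failover case), building
 * one servinfo4_t per server.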
727 more:
728 if (!(uap->flags & MS_SYSSPACE)) {
729 if (args == NULL)
730 args = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);
731 else
732 nfs4_free_args(args);
733 error = nfs4_copyin(data, uap->datalen, args);
734 if (error) {
735 if (args) {
736 kmem_free(args, sizeof (*args));
738 return (error);
740 } else {
741 args = (struct nfs_args *)data;
744 flags = args->flags;
747 * If the request changes the locking type, disallow the remount,
748 * because it's questionable whether we can transfer the
749 * locking state correctly.
751 if (uap->flags & MS_REMOUNT) {
752 if (!(uap->flags & MS_SYSSPACE)) {
753 nfs4_free_args(args);
754 kmem_free(args, sizeof (*args));
756 if ((mi = VFTOMI4(vfsp)) != NULL) {
757 uint_t new_mi_llock;
758 uint_t old_mi_llock;
759 new_mi_llock = (flags & NFSMNT_LLOCK) ? 1 : 0;
760 old_mi_llock = (mi->mi_flags & MI4_LLOCK) ? 1 : 0;
761 if (old_mi_llock != new_mi_llock)
762 return (EBUSY);
764 return (0);
768 * For ephemeral mount trigger stub vnodes, we have two problems
769 * to solve: racing threads will likely fail the v_count check, and
770 * we want only one to proceed with the mount.
772 * For stubs, if the mount has already occurred (via a racing thread),
773 * just return success. If not, skip the v_count check and proceed.
774 * Note that we are already serialised at this point.
776 mutex_enter(&mvp->v_lock);
777 if (vn_matchops(mvp, nfs4_trigger_vnodeops)) {
778 /* mntpt is a v4 stub vnode */
779 ASSERT(RP_ISSTUB(VTOR4(mvp)));
780 ASSERT(!(uap->flags & MS_OVERLAY));
781 ASSERT(!(mvp->v_flag & VROOT));
782 if (vn_mountedvfs(mvp) != NULL) {
783 /* ephemeral mount has already occurred */
784 ASSERT(uap->flags & MS_SYSSPACE);
785 mutex_exit(&mvp->v_lock);
786 return (0);
788 } else {
789 /* mntpt is a non-v4 or v4 non-stub vnode */
790 if (!(uap->flags & MS_OVERLAY) &&
791 (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
792 mutex_exit(&mvp->v_lock);
793 if (!(uap->flags & MS_SYSSPACE)) {
794 nfs4_free_args(args);
795 kmem_free(args, sizeof (*args));
797 return (EBUSY);
800 mutex_exit(&mvp->v_lock);
802 /* make sure things are zeroed for errout: */
803 rtvp = NULL;
804 mi = NULL;
805 secdata = NULL;
808 * A valid knetconfig structure is required.
810 if (!(flags & NFSMNT_KNCONF) ||
811 args->knconf == NULL || args->knconf->knc_protofmly == NULL ||
812 args->knconf->knc_proto == NULL ||
813 (strcmp(args->knconf->knc_proto, NC_UDP) == 0)) {
814 if (!(uap->flags & MS_SYSSPACE)) {
815 nfs4_free_args(args);
816 kmem_free(args, sizeof (*args));
818 return (EINVAL);
821 if ((strlen(args->knconf->knc_protofmly) >= KNC_STRSIZE) ||
822 (strlen(args->knconf->knc_proto) >= KNC_STRSIZE)) {
823 if (!(uap->flags & MS_SYSSPACE)) {
824 nfs4_free_args(args);
825 kmem_free(args, sizeof (*args));
827 return (EINVAL);
831 * Allocate a servinfo4 struct.
833 svp = kmem_zalloc(sizeof (*svp), KM_SLEEP);
834 nfs_rw_init(&svp->sv_lock, NULL, RW_DEFAULT, NULL);
835 if (svp_tail) {
836 svp_2ndlast = svp_tail;
837 svp_tail->sv_next = svp;
838 } else {
839 svp_head = svp;
840 svp_2ndlast = svp;
843 svp_tail = svp;
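/*
 * svp_2ndlast tracks the next-to-last list entry so that, if this
 * server later proves unusable (e.g. no RDMA path when proto=rdma
 * was requested), it can be unlinked from the failover list and
 * freed before moving on to the next nfs_args entry.
 */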
844 svp->sv_knconf = args->knconf;
845 args->knconf = NULL;
848 * Get server address
850 if (args->addr == NULL || args->addr->buf == NULL) {
851 error = EINVAL;
852 goto errout;
855 svp->sv_addr.maxlen = args->addr->maxlen;
856 svp->sv_addr.len = args->addr->len;
857 svp->sv_addr.buf = args->addr->buf;
858 args->addr->buf = NULL;
861 * Get the root fhandle
863 if (args->fh == NULL || (strlen(args->fh) >= MAXPATHLEN)) {
864 error = EINVAL;
865 goto errout;
868 svp->sv_path = args->fh;
869 svp->sv_pathlen = strlen(args->fh) + 1;
870 args->fh = NULL;
873 * Get server's hostname
875 if (flags & NFSMNT_HOSTNAME) {
876 if (args->hostname == NULL || (strlen(args->hostname) >
877 MAXNETNAMELEN)) {
878 error = EINVAL;
879 goto errout;
881 svp->sv_hostnamelen = strlen(args->hostname) + 1;
882 svp->sv_hostname = args->hostname;
883 args->hostname = NULL;
884 } else {
885 char *p = "unknown-host";
886 svp->sv_hostnamelen = strlen(p) + 1;
887 svp->sv_hostname = kmem_zalloc(svp->sv_hostnamelen, KM_SLEEP);
888 (void) strcpy(svp->sv_hostname, p);
892 * RDMA MOUNT SUPPORT FOR NFS v4.
893 * Determine whether it is possible to use RDMA; if so, overload the
894 * knconf with an RDMA-specific knconf and free the original knconf.
896 if ((flags & NFSMNT_TRYRDMA) || (flags & NFSMNT_DORDMA)) {
898 * Determine the addr type for RDMA, IPv4 or v6.
900 if (strcmp(svp->sv_knconf->knc_protofmly, NC_INET) == 0)
901 addr_type = AF_INET;
902 else if (strcmp(svp->sv_knconf->knc_protofmly, NC_INET6) == 0)
903 addr_type = AF_INET6;
905 if (rdma_reachable(addr_type, &svp->sv_addr,
906 &rdma_knconf) == 0) {
908 * If successful, hijack the original knconf and
909 * replace it with the new one, depending on the flags.
911 svp->sv_origknconf = svp->sv_knconf;
912 svp->sv_knconf = rdma_knconf;
913 } else {
914 if (flags & NFSMNT_TRYRDMA) {
915 #ifdef DEBUG
916 if (rdma_debug)
917 zcmn_err(getzoneid(), CE_WARN,
918 "no RDMA onboard, revert\n");
919 #endif
922 if (flags & NFSMNT_DORDMA) {
924 * If proto=rdma is specified and no RDMA
925 * path to this server is available, then
926 * ditch this server.
927 * This is not included in the mountable
928 * server list or the replica list.
929 * Check if more servers are specified;
930 * failover case; otherwise bail out of the mount.
932 if (args->nfs_args_ext == NFS_ARGS_EXTB &&
933 args->nfs_ext_u.nfs_extB.next != NULL) {
934 data = (char *)
935 args->nfs_ext_u.nfs_extB.next;
936 if (uap->flags & MS_RDONLY &&
937 !(flags & NFSMNT_SOFT)) {
938 if (svp_head->sv_next == NULL) {
939 svp_tail = NULL;
940 svp_2ndlast = NULL;
941 sv4_free(svp_head);
942 goto more;
943 } else {
944 svp_tail = svp_2ndlast;
945 svp_2ndlast->sv_next =
946 NULL;
947 sv4_free(svp);
948 goto more;
951 } else {
953 * This is the last server specified
954 * in the nfs_args list passed down
955 * and it is not RDMA capable.
957 if (svp_head->sv_next == NULL) {
959 * Is this the only one?
961 error = EINVAL;
962 #ifdef DEBUG
963 if (rdma_debug)
964 zcmn_err(getzoneid(),
965 CE_WARN,
966 "No RDMA srv");
967 #endif
968 goto errout;
969 } else {
971 * There is a list, since some
972 * servers specified before
973 * this one passed all requirements
975 svp_tail = svp_2ndlast;
976 svp_2ndlast->sv_next = NULL;
977 sv4_free(svp);
978 goto proceed;
986 * If there are syncaddr and netname data, load them in. This is
987 * to support data needed for NFSV4 when AUTH_DH is the negotiated
988 * flavor via SECINFO (instead of using the MOUNT protocol as in V3).
990 if (args->flags & NFSMNT_SECURE) {
991 svp->sv_dhsec = create_authdh_data(args->netname,
992 strlen(args->netname),
993 args->syncaddr, svp->sv_knconf);
997 * Get the extension data which has the security data structure.
998 * This includes data for AUTH_SYS as well.
1000 if (flags & NFSMNT_NEWARGS) {
1001 switch (args->nfs_args_ext) {
1002 case NFS_ARGS_EXTA:
1003 case NFS_ARGS_EXTB:
1005 * Indicating the application is using the new
1006 * sec_data structure to pass in the security
1007 * data.
1009 secdata = args->nfs_ext_u.nfs_extA.secdata;
1010 if (secdata == NULL) {
1011 error = EINVAL;
1012 } else if (uap->flags & MS_SYSSPACE) {
1014 * Need to validate the flavor here if
1015 * sysspace, userspace was already
1016 * validated in the nfs4_copyin function.
1018 switch (secdata->rpcflavor) {
1019 case AUTH_NONE:
1020 case AUTH_UNIX:
1021 case AUTH_LOOPBACK:
1022 case AUTH_DES:
1023 case RPCSEC_GSS:
1024 break;
1025 default:
1026 error = EINVAL;
1027 goto errout;
1030 args->nfs_ext_u.nfs_extA.secdata = NULL;
1031 break;
1033 default:
1034 error = EINVAL;
1035 break;
1038 } else if (flags & NFSMNT_SECURE) {
1040 * NFSMNT_SECURE is deprecated but we keep it
1041 * to support the rogue user-generated application
1042 * that may use this undocumented interface to do
1043 * AUTH_DH security, e.g. our own rexd.
1045 * Also note that NFSMNT_SECURE is used for passing
1046 * AUTH_DH info to be used in negotiation.
1048 secdata = create_authdh_data(args->netname,
1049 strlen(args->netname), args->syncaddr, svp->sv_knconf);
1051 } else {
1052 secdata = kmem_alloc(sizeof (*secdata), KM_SLEEP);
1053 secdata->secmod = secdata->rpcflavor = AUTH_SYS;
1054 secdata->data = NULL;
1057 svp->sv_secdata = secdata;
1060 * The user does not explicitly specify a flavor, and a
1061 * user-defined default flavor is passed down.
1063 if (flags & NFSMNT_SECDEFAULT) {
1064 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
1065 svp->sv_flags |= SV4_TRYSECDEFAULT;
1066 nfs_rw_exit(&svp->sv_lock);
1070 * Failover support:
1072 * We may have a linked list of nfs_args structures,
1073 * which means the user is looking for failover. If
1074 * the mount is not "read-only", or is "soft",
1075 * we want to bail out with EINVAL.
1077 if (args->nfs_args_ext == NFS_ARGS_EXTB &&
1078 args->nfs_ext_u.nfs_extB.next != NULL) {
1079 if (uap->flags & MS_RDONLY && !(flags & NFSMNT_SOFT)) {
1080 data = (char *)args->nfs_ext_u.nfs_extB.next;
1081 goto more;
1083 error = EINVAL;
1084 goto errout;
1088 * Determine the zone we're being mounted into.
1090 zone_hold(mntzone = zone); /* start with this assumption */
1091 if (getzoneid() == GLOBAL_ZONEID) {
1092 zone_rele(mntzone);
1093 mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
1094 ASSERT(mntzone != NULL);
1095 if (mntzone != zone) {
1096 error = EBUSY;
1097 goto errout;
1102 * Stop the mount from going any further if the zone is going away.
1104 if (zone_status_get(mntzone) >= ZONE_IS_SHUTTING_DOWN) {
1105 error = EBUSY;
1106 goto errout;
1110 * Get root vnode.
1112 proceed:
1113 error = nfs4rootvp(&rtvp, vfsp, svp_head, flags, cr, mntzone);
1114 if (error) {
1115 /* if nfs4rootvp failed, it will free svp_head */
1116 svp_head = NULL;
1117 goto errout;
1120 mi = VTOMI4(rtvp);
1123 * Send client id to the server, if necessary
1125 nfs4_error_zinit(&n4e);
1126 nfs4setclientid(mi, cr, FALSE, &n4e);
1128 error = n4e.error;
1130 if (error)
1131 goto errout;
1134 * Set option fields in the mount info record
1137 if (svp_head->sv_next) {
1138 mutex_enter(&mi->mi_lock);
1139 mi->mi_flags |= MI4_LLOCK;
1140 mutex_exit(&mi->mi_lock);
1142 error = nfs4_setopts(rtvp, DATAMODEL_NATIVE, args);
1143 if (error)
1144 goto errout;
1147 * Time to tie in the mirror mount info at last!
1149 if (flags & NFSMNT_EPHEMERAL)
1150 error = nfs4_record_ephemeral_mount(mi, mvp);
1152 errout:
1153 if (error) {
1154 if (rtvp != NULL) {
1155 rp = VTOR4(rtvp);
1156 if (rp->r_flags & R4HASHED)
1157 rp4_rmhash(rp);
1159 if (mi != NULL) {
1160 nfs4_async_stop(vfsp);
1161 nfs4_async_manager_stop(vfsp);
1162 nfs4_remove_mi_from_server(mi, NULL);
1163 if (rtvp != NULL)
1164 VN_RELE(rtvp);
1165 if (mntzone != NULL)
1166 zone_rele(mntzone);
1167 /* need to remove it from the zone */
1168 removed = nfs4_mi_zonelist_remove(mi);
1169 if (removed)
1170 zone_rele_ref(&mi->mi_zone_ref,
1171 ZONE_REF_NFSV4);
1172 MI4_RELE(mi);
1173 if (!(uap->flags & MS_SYSSPACE) && args) {
1174 nfs4_free_args(args);
1175 kmem_free(args, sizeof (*args));
1177 return (error);
1179 if (svp_head)
1180 sv4_free(svp_head);
1183 if (!(uap->flags & MS_SYSSPACE) && args) {
1184 nfs4_free_args(args);
1185 kmem_free(args, sizeof (*args));
1187 if (rtvp != NULL)
1188 VN_RELE(rtvp);
1190 if (mntzone != NULL)
1191 zone_rele(mntzone);
1193 return (error);
1196 #ifdef DEBUG
1197 #define VERS_MSG "NFS4 server "
1198 #else
1199 #define VERS_MSG "NFS server "
1200 #endif
1202 #define READ_MSG \
1203 VERS_MSG "%s returned 0 for read transfer size"
1204 #define WRITE_MSG \
1205 VERS_MSG "%s returned 0 for write transfer size"
1206 #define SIZE_MSG \
1207 VERS_MSG "%s returned 0 for maximum file size"
1210 * Get the symbolic link text from the server for a given filehandle
1211 * of that symlink.
1213 * (get symlink text) PUTFH READLINK
1215 static int
1216 getlinktext_otw(mntinfo4_t *mi, nfs_fh4 *fh, char **linktextp, cred_t *cr,
1217 int flags)
1219 COMPOUND4args_clnt args;
1220 COMPOUND4res_clnt res;
1221 int doqueue;
1222 nfs_argop4 argop[2];
1223 nfs_resop4 *resop;
1224 READLINK4res *lr_res;
1225 uint_t len;
1226 bool_t needrecov = FALSE;
1227 nfs4_recov_state_t recov_state;
1228 nfs4_sharedfh_t *sfh;
1229 nfs4_error_t e;
1230 int num_retry = nfs4_max_mount_retry;
1231 int recovery = !(flags & NFS4_GETFH_NEEDSOP);
1233 sfh = sfh4_get(fh, mi);
1234 recov_state.rs_flags = 0;
1235 recov_state.rs_num_retry_despite_err = 0;
1237 recov_retry:
1238 nfs4_error_zinit(&e);
1240 args.array_len = 2;
1241 args.array = argop;
1242 args.ctag = TAG_GET_SYMLINK;
1244 if (! recovery) {
1245 e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
1246 if (e.error) {
1247 sfh4_rele(&sfh);
1248 return (e.error);
1252 /* 0. putfh symlink fh */
1253 argop[0].argop = OP_CPUTFH;
1254 argop[0].nfs_argop4_u.opcputfh.sfh = sfh;
1256 /* 1. readlink */
1257 argop[1].argop = OP_READLINK;
1259 doqueue = 1;
1261 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1263 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
1265 if (needrecov && !recovery && num_retry-- > 0) {
1267 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1268 "getlinktext_otw: initiating recovery\n"));
1270 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
1271 OP_READLINK, NULL, NULL, NULL) == FALSE) {
1272 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
1273 if (!e.error)
1274 (void) xdr_free(xdr_COMPOUND4res_clnt,
1275 (caddr_t)&res);
1276 goto recov_retry;
1281 * If this is a non-NFSv4 protocol error and/or we weren't able to recover.
1283 if (e.error != 0) {
1284 if (! recovery)
1285 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
1286 sfh4_rele(&sfh);
1287 return (e.error);
1290 if (res.status) {
1291 e.error = geterrno4(res.status);
1292 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1293 if (! recovery)
1294 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
1295 sfh4_rele(&sfh);
1296 return (e.error);
1299 /* res.status == NFS4_OK */
1300 ASSERT(res.status == NFS4_OK);
1302 resop = &res.array[1]; /* readlink res */
1303 lr_res = &resop->nfs_resop4_u.opreadlink;
1305 /* treat symlink name as data */
1306 *linktextp = utf8_to_str((utf8string *)&lr_res->link, &len, NULL);
1308 if (! recovery)
1309 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
1310 sfh4_rele(&sfh);
1311 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1312 return (0);
1316 * Skip over consecutive slashes and "/./" in a pathname.
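 *
 * For example, given "//./export/home", a single call advances pn_path
 * past the leading slashes and the "/." so that a subsequent
 * pn_getcomponent() returns "export".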
1318 void
1319 pathname_skipslashdot(struct pathname *pnp)
1321 char *c1, *c2;
1323 while (pnp->pn_pathlen > 0 && *pnp->pn_path == '/') {
1325 c1 = pnp->pn_path + 1;
1326 c2 = pnp->pn_path + 2;
1328 if (*c1 == '.' && (*c2 == '/' || *c2 == '\0')) {
1329 pnp->pn_path = pnp->pn_path + 2; /* skip "/." */
1330 pnp->pn_pathlen = pnp->pn_pathlen - 2;
1331 } else {
1332 pnp->pn_path++;
1333 pnp->pn_pathlen--;
1339 * Resolve a symbolic link path. The symlink is in the nth component of
1340 * svp->sv_path and has an nfs4 file handle "fh".
1341 * Upon return, the sv_path will point to the new path that has the nth
1342 * component resolved to its symlink text.
1345 resolve_sympath(mntinfo4_t *mi, servinfo4_t *svp, int nth, nfs_fh4 *fh,
1346 cred_t *cr, int flags)
1348 char *oldpath;
1349 char *symlink, *newpath;
1350 struct pathname oldpn, newpn;
1351 char component[MAXNAMELEN];
1352 int i, addlen, error = 0;
1353 int oldpathlen;
1355 /* Get the symbolic link text over the wire. */
1356 error = getlinktext_otw(mi, fh, &symlink, cr, flags);
1358 if (error || symlink == NULL || strlen(symlink) == 0)
1359 return (error);
1362 * Compose the new pathname.
1363 * Note:
1364 * - only the nth component is resolved for the pathname.
1365 * - pathname.pn_pathlen does not count the ending null byte.
1367 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1368 oldpath = svp->sv_path;
1369 oldpathlen = svp->sv_pathlen;
1370 if (error = pn_get(oldpath, UIO_SYSSPACE, &oldpn)) {
1371 nfs_rw_exit(&svp->sv_lock);
1372 kmem_free(symlink, strlen(symlink) + 1);
1373 return (error);
1375 nfs_rw_exit(&svp->sv_lock);
1376 pn_alloc(&newpn);
1379 * Skip over previous components from the oldpath so that the
1380 * oldpn.pn_path will point to the symlink component. Skip
1381 * leading slashes and "/./" (no OP_LOOKUP on ".") so that
1382 * pn_getcomponent can get the component.
1384 for (i = 1; i < nth; i++) {
1385 pathname_skipslashdot(&oldpn);
1386 error = pn_getcomponent(&oldpn, component);
1387 if (error)
1388 goto out;
1392 * Copy the old path up to the component right before the symlink
1393 * if the symlink is not an absolute path.
1395 if (symlink[0] != '/') {
1396 addlen = oldpn.pn_path - oldpn.pn_buf;
1397 bcopy(oldpn.pn_buf, newpn.pn_path, addlen);
1398 newpn.pn_pathlen += addlen;
1399 newpn.pn_path += addlen;
1400 newpn.pn_buf[newpn.pn_pathlen] = '/';
1401 newpn.pn_pathlen++;
1402 newpn.pn_path++;
1405 /* copy the resolved symbolic link text */
1406 addlen = strlen(symlink);
1407 if (newpn.pn_pathlen + addlen >= newpn.pn_bufsize) {
1408 error = ENAMETOOLONG;
1409 goto out;
1411 bcopy(symlink, newpn.pn_path, addlen);
1412 newpn.pn_pathlen += addlen;
1413 newpn.pn_path += addlen;
1416 * Check if there is any remaining path after the symlink component.
1417 * First, skip the symlink component.
1419 pathname_skipslashdot(&oldpn);
1420 if (error = pn_getcomponent(&oldpn, component))
1421 goto out;
1423 addlen = pn_pathleft(&oldpn); /* includes counting the slash */
1426 * Copy the remaining path to the new pathname if there is any.
1428 if (addlen > 0) {
1429 if (newpn.pn_pathlen + addlen >= newpn.pn_bufsize) {
1430 error = ENAMETOOLONG;
1431 goto out;
1433 bcopy(oldpn.pn_path, newpn.pn_path, addlen);
1434 newpn.pn_pathlen += addlen;
1436 newpn.pn_buf[newpn.pn_pathlen] = '\0';
1438 /* get the newpath and store it in the servinfo4_t */
1439 newpath = kmem_alloc(newpn.pn_pathlen + 1, KM_SLEEP);
1440 bcopy(newpn.pn_buf, newpath, newpn.pn_pathlen);
1441 newpath[newpn.pn_pathlen] = '\0';
1443 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
1444 svp->sv_path = newpath;
1445 svp->sv_pathlen = strlen(newpath) + 1;
1446 nfs_rw_exit(&svp->sv_lock);
1448 kmem_free(oldpath, oldpathlen);
1449 out:
1450 kmem_free(symlink, strlen(symlink) + 1);
1451 pn_free(&newpn);
1452 pn_free(&oldpn);
1454 return (error);
1458 * This routine updates the servinfo4 structure with the new referred server
1459 * info.
1460 * nfsfsloc has the location related information
1461 * fsp has the hostname and pathname info.
1462 * new path = pathname from referral + part of orig pathname(based on nth).
1464 static void
1465 update_servinfo4(servinfo4_t *svp, fs_location4 *fsp,
1466 struct nfs_fsl_info *nfsfsloc, char *orig_path, int nth)
1468 struct knetconfig *knconf, *svknconf;
1469 struct netbuf *saddr;
1470 sec_data_t *secdata;
1471 utf8string *host;
1472 int i = 0, num_slashes = 0;
1473 char *p, *spath, *op, *new_path;
1475 /* Update knconf */
1476 knconf = svp->sv_knconf;
1477 free_knconf_contents(knconf);
1478 bzero(knconf, sizeof (struct knetconfig));
1479 svknconf = nfsfsloc->knconf;
1480 knconf->knc_semantics = svknconf->knc_semantics;
1481 knconf->knc_protofmly = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1482 knconf->knc_proto = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1483 knconf->knc_rdev = svknconf->knc_rdev;
1484 bcopy(svknconf->knc_protofmly, knconf->knc_protofmly, KNC_STRSIZE);
1485 bcopy(svknconf->knc_proto, knconf->knc_proto, KNC_STRSIZE);
1487 /* Update server address */
1488 saddr = &svp->sv_addr;
1489 if (saddr->buf != NULL)
1490 kmem_free(saddr->buf, saddr->maxlen);
1491 saddr->buf = kmem_alloc(nfsfsloc->addr->maxlen, KM_SLEEP);
1492 saddr->len = nfsfsloc->addr->len;
1493 saddr->maxlen = nfsfsloc->addr->maxlen;
1494 bcopy(nfsfsloc->addr->buf, saddr->buf, nfsfsloc->addr->len);
1496 /* Update server name */
1497 host = fsp->server_val;
1498 kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
1499 svp->sv_hostname = kmem_zalloc(host->utf8string_len + 1, KM_SLEEP);
1500 bcopy(host->utf8string_val, svp->sv_hostname, host->utf8string_len);
1501 svp->sv_hostname[host->utf8string_len] = '\0';
1502 svp->sv_hostnamelen = host->utf8string_len + 1;
1505 * Update server path.
1506 * We need to set up the proper path here.
1507 * For example, if we got a path name serv1:/rp/aaa/bbb
1508 * where aaa is a referral and points to serv2:/rpool/aa
1509 * we need to set the path to serv2:/rpool/aa/bbb
1510 * The first part of the code below generates /rpool/aa
1511 * and the second part appends /bbb to the server path.
1513 spath = p = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1514 *p++ = '/';
1515 for (i = 0; i < fsp->rootpath.pathname4_len; i++) {
1516 component4 *comp;
1518 comp = &fsp->rootpath.pathname4_val[i];
1519 /* If no space, null the string and bail */
1520 if ((p - spath) + comp->utf8string_len + 1 > MAXPATHLEN) {
1521 p = spath + MAXPATHLEN - 1;
1522 spath[0] = '\0';
1523 break;
1525 bcopy(comp->utf8string_val, p, comp->utf8string_len);
1526 p += comp->utf8string_len;
1527 *p++ = '/';
1529 if (fsp->rootpath.pathname4_len != 0)
1530 *(p - 1) = '\0';
1531 else
1532 *p = '\0';
1533 p = spath;
1535 new_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1536 (void) strlcpy(new_path, p, MAXPATHLEN);
1537 kmem_free(p, MAXPATHLEN);
1538 i = strlen(new_path);
1540 for (op = orig_path; *op; op++) {
1541 if (*op == '/')
1542 num_slashes++;
1543 if (num_slashes == nth + 2) {
1544 while (*op != '\0') {
1545 new_path[i] = *op;
1546 i++;
1547 op++;
1549 break;
1552 new_path[i] = '\0';
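/*
 * Continuing the example above: the loop has now appended the part of
 * orig_path that follows the referral component ("/bbb") to the
 * referred-to root path ("/rpool/aa"), giving "/rpool/aa/bbb".
 */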
1554 kmem_free(svp->sv_path, svp->sv_pathlen);
1555 svp->sv_pathlen = strlen(new_path) + 1;
1556 svp->sv_path = kmem_alloc(svp->sv_pathlen, KM_SLEEP);
1557 bcopy(new_path, svp->sv_path, svp->sv_pathlen);
1558 kmem_free(new_path, MAXPATHLEN);
1561 * All the security data is specific to the old server.
1562 * Clean it up except secdata which deals with mount options.
1563 * We need to inherit that data. Copy secdata into our new servinfo4.
1565 if (svp->sv_dhsec) {
1566 sec_clnt_freeinfo(svp->sv_dhsec);
1567 svp->sv_dhsec = NULL;
1569 if (svp->sv_save_secinfo &&
1570 svp->sv_save_secinfo != svp->sv_secinfo) {
1571 secinfo_free(svp->sv_save_secinfo);
1572 svp->sv_save_secinfo = NULL;
1574 if (svp->sv_secinfo) {
1575 secinfo_free(svp->sv_secinfo);
1576 svp->sv_secinfo = NULL;
1578 svp->sv_currsec = NULL;
1580 secdata = kmem_alloc(sizeof (*secdata), KM_SLEEP);
1581 *secdata = *svp->sv_secdata;
1582 secdata->data = NULL;
1583 if (svp->sv_secdata) {
1584 sec_clnt_freeinfo(svp->sv_secdata);
1585 svp->sv_secdata = NULL;
1587 svp->sv_secdata = secdata;
1591 * Resolve a referral. The referral is in the n+1th component of
1592 * svp->sv_path and has a parent nfs4 file handle "fh".
1593 * Upon return, the sv_path will point to the new path that has referral
1594 * component resolved to its referred path and part of original path.
1595 * Hostname and other address information is also updated.
1598 resolve_referral(mntinfo4_t *mi, servinfo4_t *svp, cred_t *cr, int nth,
1599 nfs_fh4 *fh)
1601 nfs4_sharedfh_t *sfh;
1602 struct nfs_fsl_info nfsfsloc;
1603 nfs4_ga_res_t garp;
1604 COMPOUND4res_clnt callres;
1605 fs_location4 *fsp;
1606 char *nm, *orig_path;
1607 int orig_pathlen = 0, ret = -1, index;
1609 if (svp->sv_pathlen <= 0)
1610 return (ret);
1612 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
1613 orig_pathlen = svp->sv_pathlen;
1614 orig_path = kmem_alloc(orig_pathlen, KM_SLEEP);
1615 bcopy(svp->sv_path, orig_path, orig_pathlen);
1616 nm = extract_referral_point(svp->sv_path, nth);
1617 setup_newsvpath(svp, nth);
1618 nfs_rw_exit(&svp->sv_lock);
1620 sfh = sfh4_get(fh, mi);
1621 index = nfs4_process_referral(mi, sfh, nm, cr,
1622 &garp, &callres, &nfsfsloc);
1623 sfh4_rele(&sfh);
1624 kmem_free(nm, MAXPATHLEN);
1625 if (index < 0) {
1626 kmem_free(orig_path, orig_pathlen);
1627 return (index);
1630 fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[index];
1631 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
1632 update_servinfo4(svp, fsp, &nfsfsloc, orig_path, nth);
1633 nfs_rw_exit(&svp->sv_lock);
1635 mutex_enter(&mi->mi_lock);
1636 mi->mi_vfs_referral_loop_cnt++;
1637 mutex_exit(&mi->mi_lock);
1639 ret = 0;
1640 bad:
1641 /* Free up XDR memory allocated in nfs4_process_referral() */
1642 xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1643 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1644 kmem_free(orig_path, orig_pathlen);
1646 return (ret);
1650 * Get the root filehandle for the given filesystem and server, and update
1651 * svp.
1653 * If NFS4_GETFH_NEEDSOP is set, then use nfs4_start_fop and nfs4_end_fop
1654 * to coordinate with recovery. Otherwise, the caller is assumed to be
1655 * the recovery thread or have already done a start_fop.
1657 * Errors are returned by the nfs4_error_t parameter.
1659 static void
1660 nfs4getfh_otw(struct mntinfo4 *mi, servinfo4_t *svp, vtype_t *vtp,
1661 int flags, cred_t *cr, nfs4_error_t *ep)
1663 COMPOUND4args_clnt args;
1664 COMPOUND4res_clnt res;
1665 int doqueue = 1;
1666 nfs_argop4 *argop;
1667 nfs_resop4 *resop;
1668 nfs4_ga_res_t *garp;
1669 int num_argops;
1670 lookup4_param_t lookuparg;
1671 nfs_fh4 *tmpfhp;
1672 nfs_fh4 *resfhp;
1673 bool_t needrecov = FALSE;
1674 nfs4_recov_state_t recov_state;
1675 int llndx;
1676 int nthcomp;
1677 int recovery = !(flags & NFS4_GETFH_NEEDSOP);
1679 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1680 ASSERT(svp->sv_path != NULL);
1681 if (svp->sv_path[0] == '\0') {
1682 nfs_rw_exit(&svp->sv_lock);
1683 nfs4_error_init(ep, EINVAL);
1684 return;
1686 nfs_rw_exit(&svp->sv_lock);
1688 recov_state.rs_flags = 0;
1689 recov_state.rs_num_retry_despite_err = 0;
1691 recov_retry:
1692 if (mi->mi_vfs_referral_loop_cnt >= NFS4_REFERRAL_LOOP_MAX) {
1693 DTRACE_PROBE3(nfs4clnt__debug__referral__loop, mntinfo4 *,
1694 mi, servinfo4_t *, svp, char *, "nfs4getfh_otw");
1695 nfs4_error_init(ep, EINVAL);
1696 return;
1698 nfs4_error_zinit(ep);
1700 if (!recovery) {
1701 ep->error = nfs4_start_fop(mi, NULL, NULL, OH_MOUNT,
1702 &recov_state, NULL);
1705 * If recovery has been started and this request was
1706 * initiated by a mount, then we must wait for recovery
1707 * to finish before proceeding, otherwise, the error
1708 * cleanup would remove data structures needed by the
1709 * recovery thread.
1711 if (ep->error) {
1712 mutex_enter(&mi->mi_lock);
1713 if (mi->mi_flags & MI4_MOUNTING) {
1714 mi->mi_flags |= MI4_RECOV_FAIL;
1715 mi->mi_error = EIO;
1717 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1718 "nfs4getfh_otw: waiting 4 recovery\n"));
1720 while (mi->mi_flags & MI4_RECOV_ACTIV)
1721 cv_wait(&mi->mi_failover_cv,
1722 &mi->mi_lock);
1724 mutex_exit(&mi->mi_lock);
1725 return;
1729 * If the client does not specify a specific flavor to use
1730 * and has not gotten a secinfo list from the server yet,
1731 * retrieve the secinfo list from the server and use a
1732 * flavor from the list to mount.
1734 * If we fail to get the secinfo list from the server, then
1735 * try the default flavor.
1737 if ((svp->sv_flags & SV4_TRYSECDEFAULT) &&
1738 svp->sv_secinfo == NULL) {
1739 (void) nfs4_secinfo_path(mi, cr, FALSE);
1743 if (recovery)
1744 args.ctag = TAG_REMAP_MOUNT;
1745 else
1746 args.ctag = TAG_MOUNT;
1748 lookuparg.l4_getattrs = LKP4_ALL_ATTRIBUTES;
1749 lookuparg.argsp = &args;
1750 lookuparg.resp = &res;
1751 lookuparg.header_len = 2; /* Putrootfh, getfh */
1752 lookuparg.trailer_len = 0;
1753 lookuparg.ga_bits = FATTR4_FSINFO_MASK;
1754 lookuparg.mi = mi;
1756 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1757 ASSERT(svp->sv_path != NULL);
1758 llndx = nfs4lookup_setup(svp->sv_path, &lookuparg, 0);
1759 nfs_rw_exit(&svp->sv_lock);
1761 argop = args.array;
1762 num_argops = args.array_len;
1764 /* choose public or root filehandle */
1765 if (flags & NFS4_GETFH_PUBLIC)
1766 argop[0].argop = OP_PUTPUBFH;
1767 else
1768 argop[0].argop = OP_PUTROOTFH;
1770 /* get fh */
1771 argop[1].argop = OP_GETFH;
1773 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1774 "nfs4getfh_otw: %s call, mi 0x%p",
1775 needrecov ? "recov" : "first", (void *)mi));
1777 rfs4call(mi, &args, &res, cr, &doqueue, RFSCALL_SOFT, ep);
1779 needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp);
1781 if (needrecov) {
1782 bool_t abort;
1784 if (recovery) {
1785 nfs4args_lookup_free(argop, num_argops);
1786 kmem_free(argop,
1787 lookuparg.arglen * sizeof (nfs_argop4));
1788 if (!ep->error)
1789 (void) xdr_free(xdr_COMPOUND4res_clnt,
1790 (caddr_t)&res);
1791 return;
1794 NFS4_DEBUG(nfs4_client_recov_debug,
1795 (CE_NOTE, "nfs4getfh_otw: initiating recovery\n"));
1797 abort = nfs4_start_recovery(ep, mi, NULL,
1798 NULL, NULL, NULL, OP_GETFH, NULL, NULL, NULL);
1799 if (!ep->error) {
1800 ep->error = geterrno4(res.status);
1801 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1803 nfs4args_lookup_free(argop, num_argops);
1804 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1805 nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state, needrecov);
1806 /* have another go? */
1807 if (abort == FALSE)
1808 goto recov_retry;
1809 return;
1813 * No recovery, but check if error is set.
1815 if (ep->error) {
1816 nfs4args_lookup_free(argop, num_argops);
1817 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1818 if (!recovery)
1819 nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
1820 needrecov);
1821 return;
1824 is_link_err:
1826 /* for non-recovery errors */
1827 if (res.status && res.status != NFS4ERR_SYMLINK &&
1828 res.status != NFS4ERR_MOVED) {
1829 if (!recovery) {
1830 nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
1831 needrecov);
1833 nfs4args_lookup_free(argop, num_argops);
1834 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1835 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1836 return;
1840 * If any intermediate component in the path is a symbolic link,
1841 * resolve the symlink, then try mount again using the new path.
1843 if (res.status == NFS4ERR_SYMLINK || res.status == NFS4ERR_MOVED) {
1844 int where;
1847 * Need to call nfs4_end_op before resolve_sympath to avoid
1848 * potential nfs4_start_op deadlock.
1850 if (!recovery)
1851 nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
1852 needrecov);
1855 * This must be from OP_LOOKUP failure. The (cfh) for this
1856 * OP_LOOKUP is a symlink node. Find out where the
1857 * OP_GETFH is for the (cfh) that is a symlink node.
1859 * Example:
1860 * (mount) PUTROOTFH, GETFH, LOOKUP comp1, GETFH, GETATTR,
1861 * LOOKUP comp2, GETFH, GETATTR, LOOKUP comp3, GETFH, GETATTR
1863 * LOOKUP comp3 fails with SYMLINK because comp2 is a symlink.
1864 * In this case, where = 7, nthcomp = 2.
1866 where = res.array_len - 2;
1867 ASSERT(where > 0);
1869 if (res.status == NFS4ERR_SYMLINK) {
1871 resop = &res.array[where - 1];
1872 ASSERT(resop->resop == OP_GETFH);
1873 tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
1874 nthcomp = res.array_len/3 - 1;
1875 ep->error = resolve_sympath(mi, svp, nthcomp,
1876 tmpfhp, cr, flags);
1878 } else if (res.status == NFS4ERR_MOVED) {
1880 resop = &res.array[where - 2];
1881 ASSERT(resop->resop == OP_GETFH);
1882 tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
1883 nthcomp = res.array_len/3 - 1;
1884 ep->error = resolve_referral(mi, svp, cr, nthcomp,
1885 tmpfhp);
1888 nfs4args_lookup_free(argop, num_argops);
1889 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1890 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1892 if (ep->error)
1893 return;
1895 goto recov_retry;
1898 /* getfh */
1899 resop = &res.array[res.array_len - 2];
1900 ASSERT(resop->resop == OP_GETFH);
1901 resfhp = &resop->nfs_resop4_u.opgetfh.object;
1903 /* getattr fsinfo res */
1904 resop++;
1905 garp = &resop->nfs_resop4_u.opgetattr.ga_res;
1907 *vtp = garp->n4g_va.va_type;
1909 mi->mi_fh_expire_type = garp->n4g_ext_res->n4g_fet;
1911 mutex_enter(&mi->mi_lock);
1912 if (garp->n4g_ext_res->n4g_pc4.pc4_link_support)
1913 mi->mi_flags |= MI4_LINK;
1914 if (garp->n4g_ext_res->n4g_pc4.pc4_symlink_support)
1915 mi->mi_flags |= MI4_SYMLINK;
1916 if (garp->n4g_ext_res->n4g_suppattrs & FATTR4_ACL_MASK)
1917 mi->mi_flags |= MI4_ACL;
1918 mutex_exit(&mi->mi_lock);
1920 if (garp->n4g_ext_res->n4g_maxread == 0)
1921 mi->mi_tsize =
1922 MIN(MAXBSIZE, mi->mi_tsize);
1923 else
1924 mi->mi_tsize =
1925 MIN(garp->n4g_ext_res->n4g_maxread,
1926 mi->mi_tsize);
1928 if (garp->n4g_ext_res->n4g_maxwrite == 0)
1929 mi->mi_stsize =
1930 MIN(MAXBSIZE, mi->mi_stsize);
1931 else
1932 mi->mi_stsize =
1933 MIN(garp->n4g_ext_res->n4g_maxwrite,
1934 mi->mi_stsize);
1936 if (garp->n4g_ext_res->n4g_maxfilesize != 0)
1937 mi->mi_maxfilesize =
1938 MIN(garp->n4g_ext_res->n4g_maxfilesize,
1939 mi->mi_maxfilesize);
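/*
 * The clamping above limits the client's transfer sizes to what the
 * server advertised; a reported value of 0 for maxread/maxwrite caps
 * the size at MAXBSIZE instead, and maxfilesize is only honored when
 * it is non-zero.
 */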
1942 * If the final component is a symbolic link, resolve the symlink,
1943 * then try the mount again using the new path.
1945 * Assume no symbolic link for the root filesystem "/".
1947 if (*vtp == VLNK) {
1949 * nthcomp is the total result length minus
1950 * the 1st 2 OPs (PUTROOTFH, GETFH),
1951 * then divided by 3 (LOOKUP,GETFH,GETATTR)
1953 * e.g. PUTROOTFH GETFH LOOKUP 1st-comp GETFH GETATTR
1954 * LOOKUP 2nd-comp GETFH GETATTR
1956 * (8 - 2)/3 = 2
1958 nthcomp = (res.array_len - 2)/3;
1961 * Need to call nfs4_end_op before resolve_sympath to avoid
1962 * potential nfs4_start_op deadlock. See RFE 4777612.
1964 if (!recovery)
1965 nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
1966 needrecov);
1968 ep->error = resolve_sympath(mi, svp, nthcomp, resfhp, cr,
1969 flags);
1971 nfs4args_lookup_free(argop, num_argops);
1972 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1973 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1975 if (ep->error)
1976 return;
1978 goto recov_retry;
1982 * We need to figure out where in the compound the getfh
1983 * for the parent directory is. If the object to be mounted is
1984 * the root, then there is no lookup at all:
1985 * PUTROOTFH, GETFH.
1986 * If the object to be mounted is in the root, then the compound is:
1987 * PUTROOTFH, GETFH, LOOKUP, GETFH, GETATTR.
1988 * In either of these cases, the index of the GETFH is 1.
1989 * If it is not at the root, then it's something like:
1990 * PUTROOTFH, GETFH, LOOKUP, GETFH, GETATTR,
1991 * LOOKUP, GETFH, GETATTR
1992 * In this case, the index is llndx (last lookup index) - 2.
1994 if (llndx == -1 || llndx == 2)
1995 resop = &res.array[1];
1996 else {
1997 ASSERT(llndx > 2);
1998 resop = &res.array[llndx-2];
2001 ASSERT(resop->resop == OP_GETFH);
2002 tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
2004 /* save the filehandles for the replica */
2005 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2006 ASSERT(tmpfhp->nfs_fh4_len <= NFS4_FHSIZE);
2007 svp->sv_pfhandle.fh_len = tmpfhp->nfs_fh4_len;
2008 bcopy(tmpfhp->nfs_fh4_val, svp->sv_pfhandle.fh_buf,
2009 tmpfhp->nfs_fh4_len);
2010 ASSERT(resfhp->nfs_fh4_len <= NFS4_FHSIZE);
2011 svp->sv_fhandle.fh_len = resfhp->nfs_fh4_len;
2012 bcopy(resfhp->nfs_fh4_val, svp->sv_fhandle.fh_buf, resfhp->nfs_fh4_len);
2014 /* initialize fsid and supp_attrs for server fs */
2015 svp->sv_fsid = garp->n4g_fsid;
2016 svp->sv_supp_attrs =
2017 garp->n4g_ext_res->n4g_suppattrs | FATTR4_MANDATTR_MASK;
2019 nfs_rw_exit(&svp->sv_lock);
2020 nfs4args_lookup_free(argop, num_argops);
2021 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
2022 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2023 if (!recovery)
2024 nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state, needrecov);
2028 * Save a copy of the servinfo4_t structure.
2029 * We might need it if getting the file handle fails in the referral
2030 * case, so we can replace the servinfo4 struct and try again.
2032 static struct servinfo4 *
2033 copy_svp(servinfo4_t *nsvp)
2035 servinfo4_t *svp = NULL;
2036 struct knetconfig *sknconf, *tknconf;
2037 struct netbuf *saddr, *taddr;
2039 svp = kmem_zalloc(sizeof (*svp), KM_SLEEP);
2040 nfs_rw_init(&svp->sv_lock, NULL, RW_DEFAULT, NULL);
2041 svp->sv_flags = nsvp->sv_flags;
2042 svp->sv_fsid = nsvp->sv_fsid;
2043 svp->sv_hostnamelen = nsvp->sv_hostnamelen;
2044 svp->sv_pathlen = nsvp->sv_pathlen;
2045 svp->sv_supp_attrs = nsvp->sv_supp_attrs;
2047 svp->sv_path = kmem_alloc(svp->sv_pathlen, KM_SLEEP);
2048 svp->sv_hostname = kmem_alloc(svp->sv_hostnamelen, KM_SLEEP);
2049 bcopy(nsvp->sv_hostname, svp->sv_hostname, svp->sv_hostnamelen);
2050 bcopy(nsvp->sv_path, svp->sv_path, svp->sv_pathlen);
2052 saddr = &nsvp->sv_addr;
2053 taddr = &svp->sv_addr;
2054 taddr->maxlen = saddr->maxlen;
2055 taddr->len = saddr->len;
2056 if (saddr->len > 0) {
2057 taddr->buf = kmem_zalloc(saddr->maxlen, KM_SLEEP);
2058 bcopy(saddr->buf, taddr->buf, saddr->len);
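/* Deep-copy the knetconfig, including its protocol and protocol-family strings. */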
2061 svp->sv_knconf = kmem_zalloc(sizeof (struct knetconfig), KM_SLEEP);
2062 sknconf = nsvp->sv_knconf;
2063 tknconf = svp->sv_knconf;
2064 tknconf->knc_semantics = sknconf->knc_semantics;
2065 tknconf->knc_rdev = sknconf->knc_rdev;
2066 if (sknconf->knc_proto != NULL) {
2067 tknconf->knc_proto = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
2068 bcopy(sknconf->knc_proto, (char *)tknconf->knc_proto,
2069 KNC_STRSIZE);
2071 if (sknconf->knc_protofmly != NULL) {
2072 tknconf->knc_protofmly = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
2073 bcopy(sknconf->knc_protofmly, (char *)tknconf->knc_protofmly,
2074 KNC_STRSIZE);
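/*
 * If an original knetconfig was saved in sv_origknconf, deep-copy it
 * as well; callback setup prefers it over sv_knconf (see
 * nfs4setclientid_otw()).
 */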
2077 if (nsvp->sv_origknconf != NULL) {
2078 svp->sv_origknconf = kmem_zalloc(sizeof (struct knetconfig),
2079 KM_SLEEP);
2080 sknconf = nsvp->sv_origknconf;
2081 tknconf = svp->sv_origknconf;
2082 tknconf->knc_semantics = sknconf->knc_semantics;
2083 tknconf->knc_rdev = sknconf->knc_rdev;
2084 if (sknconf->knc_proto != NULL) {
2085 tknconf->knc_proto = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
2086 bcopy(sknconf->knc_proto, (char *)tknconf->knc_proto,
2087 KNC_STRSIZE);
2089 if (sknconf->knc_protofmly != NULL) {
2090 tknconf->knc_protofmly = kmem_zalloc(KNC_STRSIZE,
2091 KM_SLEEP);
2092 bcopy(sknconf->knc_protofmly,
2093 (char *)tknconf->knc_protofmly, KNC_STRSIZE);
2097 svp->sv_secdata = copy_sec_data(nsvp->sv_secdata);
2098 svp->sv_dhsec = copy_sec_data(nsvp->sv_dhsec);
2100 * The rest of the security information is not copied, as it is
2101 * rebuilt from the information available in secdata and dhsec.
2103 svp->sv_next = NULL;
2105 return (svp);
2108 servinfo4_t *
2109 restore_svp(mntinfo4_t *mi, servinfo4_t *svp, servinfo4_t *origsvp)
2111 servinfo4_t *srvnext, *tmpsrv;
2113 if (strcmp(svp->sv_hostname, origsvp->sv_hostname) != 0) {
2115 * Since the hostname changed, we must be dealing
2116 * with a referral, and the lookup failed. We will
2117 * restore the whole servinfo4_t to what it was before.
2119 srvnext = svp->sv_next;
2120 svp->sv_next = NULL;
2121 tmpsrv = copy_svp(origsvp);
2122 sv4_free(svp);
2123 svp = tmpsrv;
2124 svp->sv_next = srvnext;
2125 mutex_enter(&mi->mi_lock);
2126 mi->mi_servers = svp;
2127 mi->mi_curr_serv = svp;
2128 mutex_exit(&mi->mi_lock);
2130 } else if (origsvp->sv_pathlen != svp->sv_pathlen) {
2133 * For symlink case: restore original path because
2134 * it might have contained symlinks that were
2135 * expanded by nfs4getfh_otw before the failure occurred.
2137 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2138 kmem_free(svp->sv_path, svp->sv_pathlen);
2139 svp->sv_path =
2140 kmem_alloc(origsvp->sv_pathlen, KM_SLEEP);
2141 svp->sv_pathlen = origsvp->sv_pathlen;
2142 bcopy(origsvp->sv_path, svp->sv_path,
2143 origsvp->sv_pathlen);
2144 nfs_rw_exit(&svp->sv_lock);
2146 return (svp);
2149 static ushort_t nfs4_max_threads = 8; /* max number of active async threads */
2150 uint_t nfs4_bsize = 32 * 1024; /* client `block' size */
2151 static uint_t nfs4_async_clusters = 1; /* # of reqs from each async queue */
2152 static uint_t nfs4_cots_timeo = NFS_COTS_TIMEO;
2155 * Remap the root filehandle for the given filesystem.
2157 * Results are returned via the nfs4_error_t parameter.
2159 void
2160 nfs4_remap_root(mntinfo4_t *mi, nfs4_error_t *ep, int flags)
2162 struct servinfo4 *svp, *origsvp;
2163 vtype_t vtype;
2164 nfs_fh4 rootfh;
2165 int getfh_flags;
2166 int num_retry;
2168 mutex_enter(&mi->mi_lock);
2170 remap_retry:
2171 svp = mi->mi_curr_serv;
2172 getfh_flags =
2173 (flags & NFS4_REMAP_NEEDSOP) ? NFS4_GETFH_NEEDSOP : 0;
2174 getfh_flags |=
2175 (mi->mi_flags & MI4_PUBLIC) ? NFS4_GETFH_PUBLIC : 0;
2176 mutex_exit(&mi->mi_lock);
2179 * Just in case server path being mounted contains
2180 * symlinks and fails w/STALE, save the initial sv_path
2181 * so we can redrive the initial mount compound with the
2182 * initial sv_path -- not a symlink-expanded version.
2184 * This could only happen if a symlink was expanded
2185 * and the expanded mount compound failed stale. Because
2186 * it could be the case that the symlink was removed at
2187 * the server (and replaced with another symlink/dir),
2188 * we need to use the initial sv_path when attempting
2189 * to re-lookup everything and recover.
2191 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2192 origsvp = copy_svp(svp);
2193 nfs_rw_exit(&svp->sv_lock);
2195 num_retry = nfs4_max_mount_retry;
2197 do {
2199 * Get the root fh from the server. Retry nfs4_max_mount_retry
2200 * (2) times if it fails with STALE since the recovery
2201 * infrastructure doesn't do STALE recovery for components
2202 * of the server path to the object being mounted.
2204 nfs4getfh_otw(mi, svp, &vtype, getfh_flags, CRED(), ep);
2206 if (ep->error == 0 && ep->stat == NFS4_OK)
2207 break;
2210 * For some reason, the mount compound failed. Before
2211 * retrying, we need to restore original conditions.
2213 svp = restore_svp(mi, svp, origsvp);
2215 } while (num_retry-- > 0);
2217 sv4_free(origsvp);
2219 if (ep->error != 0 || ep->stat != 0) {
2220 return;
2223 if (vtype != VNON && vtype != mi->mi_type) {
2224 /* shouldn't happen */
2225 zcmn_err(mi->mi_zone->zone_id, CE_WARN,
2226 "nfs4_remap_root: server root vnode type (%d) doesn't "
2227 "match mount info (%d)", vtype, mi->mi_type);
2230 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2231 rootfh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
2232 rootfh.nfs_fh4_len = svp->sv_fhandle.fh_len;
2233 nfs_rw_exit(&svp->sv_lock);
2234 sfh4_update(mi->mi_rootfh, &rootfh);
2237 * It's possible that recovery took place on the filesystem
2238 * and the server has been updated between the time we did
2239 * the nfs4getfh_otw and now. Re-drive the otw operation
2240 * to make sure we have a good fh.
2242 mutex_enter(&mi->mi_lock);
2243 if (mi->mi_curr_serv != svp)
2244 goto remap_retry;
2246 mutex_exit(&mi->mi_lock);
2249 static int
2250 nfs4rootvp(vnode_t **rtvpp, vfs_t *vfsp, struct servinfo4 *svp_head,
2251 int flags, cred_t *cr, zone_t *zone)
2253 vnode_t *rtvp = NULL;
2254 mntinfo4_t *mi;
2255 dev_t nfs_dev;
2256 int error = 0;
2257 rnode4_t *rp;
2258 int i, len;
2259 struct vattr va;
2260 vtype_t vtype = VNON;
2261 vtype_t tmp_vtype = VNON;
2262 struct servinfo4 *firstsvp = NULL, *svp = svp_head;
2263 nfs4_oo_hash_bucket_t *bucketp;
2264 nfs_fh4 fh;
2265 char *droptext = "";
2266 struct nfs_stats *nfsstatsp;
2267 nfs4_fname_t *mfname;
2268 nfs4_error_t e;
2269 int num_retry, removed;
2270 cred_t *lcr = NULL, *tcr = cr;
2271 struct servinfo4 *origsvp;
2272 char *resource;
2274 nfsstatsp = zone_getspecific(nfsstat_zone_key, nfs_zone());
2275 ASSERT(nfsstatsp != NULL);
2277 ASSERT(nfs_zone() == zone);
2278 ASSERT(crgetref(cr));
2281 * Create a mount record and link it to the vfs struct.
2283 mi = kmem_zalloc(sizeof (*mi), KM_SLEEP);
2284 mutex_init(&mi->mi_lock, NULL, MUTEX_DEFAULT, NULL);
2285 nfs_rw_init(&mi->mi_recovlock, NULL, RW_DEFAULT, NULL);
2286 nfs_rw_init(&mi->mi_rename_lock, NULL, RW_DEFAULT, NULL);
2287 nfs_rw_init(&mi->mi_fh_lock, NULL, RW_DEFAULT, NULL);
2289 if (!(flags & NFSMNT_SOFT))
2290 mi->mi_flags |= MI4_HARD;
2291 if ((flags & NFSMNT_NOPRINT))
2292 mi->mi_flags |= MI4_NOPRINT;
2293 if (flags & NFSMNT_INT)
2294 mi->mi_flags |= MI4_INT;
2295 if (flags & NFSMNT_PUBLIC)
2296 mi->mi_flags |= MI4_PUBLIC;
2297 if (flags & NFSMNT_MIRRORMOUNT)
2298 mi->mi_flags |= MI4_MIRRORMOUNT;
2299 if (flags & NFSMNT_REFERRAL)
2300 mi->mi_flags |= MI4_REFERRAL;
2301 mi->mi_retrans = NFS_RETRIES;
2302 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
2303 svp->sv_knconf->knc_semantics == NC_TPI_COTS)
2304 mi->mi_timeo = nfs4_cots_timeo;
2305 else
2306 mi->mi_timeo = NFS_TIMEO;
2307 mi->mi_prog = NFS_PROGRAM;
2308 mi->mi_vers = NFS_V4;
2309 mi->mi_rfsnames = rfsnames_v4;
2310 mi->mi_reqs = nfsstatsp->nfs_stats_v4.rfsreqcnt_ptr;
2311 cv_init(&mi->mi_failover_cv, NULL, CV_DEFAULT, NULL);
2312 mi->mi_servers = svp;
2313 mi->mi_curr_serv = svp;
2314 mi->mi_acregmin = SEC2HR(ACREGMIN);
2315 mi->mi_acregmax = SEC2HR(ACREGMAX);
2316 mi->mi_acdirmin = SEC2HR(ACDIRMIN);
2317 mi->mi_acdirmax = SEC2HR(ACDIRMAX);
2318 mi->mi_fh_expire_type = FH4_PERSISTENT;
2319 mi->mi_clientid_next = NULL;
2320 mi->mi_clientid_prev = NULL;
2321 mi->mi_srv = NULL;
2322 mi->mi_grace_wait = 0;
2323 mi->mi_error = 0;
2324 mi->mi_srvsettime = 0;
2325 mi->mi_srvset_cnt = 0;
2327 mi->mi_count = 1;
2329 mi->mi_tsize = nfs4_tsize(svp->sv_knconf);
2330 mi->mi_stsize = mi->mi_tsize;
2332 if (flags & NFSMNT_DIRECTIO)
2333 mi->mi_flags |= MI4_DIRECTIO;
2335 mi->mi_flags |= MI4_MOUNTING;
2338 * Make a vfs struct for nfs. We do this here instead of below
2339 * because rtvp needs a vfs before we can do a getattr on it.
2341 * Assign a unique device id to the mount
2343 mutex_enter(&nfs_minor_lock);
2344 do {
2345 nfs_minor = (nfs_minor + 1) & MAXMIN32;
2346 nfs_dev = makedevice(nfs_major, nfs_minor);
2347 } while (vfs_devismounted(nfs_dev));
2348 mutex_exit(&nfs_minor_lock);
2350 vfsp->vfs_dev = nfs_dev;
2351 vfs_make_fsid(&vfsp->vfs_fsid, nfs_dev, nfs4fstyp);
2352 vfsp->vfs_data = (caddr_t)mi;
2353 vfsp->vfs_fstype = nfsfstyp;
2354 vfsp->vfs_bsize = nfs4_bsize;
2357 * Initialize fields used to support async putpage operations.
2359 for (i = 0; i < NFS4_ASYNC_TYPES; i++)
2360 mi->mi_async_clusters[i] = nfs4_async_clusters;
2361 mi->mi_async_init_clusters = nfs4_async_clusters;
2362 mi->mi_async_curr[NFS4_ASYNC_QUEUE] =
2363 mi->mi_async_curr[NFS4_ASYNC_PGOPS_QUEUE] = &mi->mi_async_reqs[0];
2364 mi->mi_max_threads = nfs4_max_threads;
2365 mutex_init(&mi->mi_async_lock, NULL, MUTEX_DEFAULT, NULL);
2366 cv_init(&mi->mi_async_reqs_cv, NULL, CV_DEFAULT, NULL);
2367 cv_init(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE], NULL, CV_DEFAULT,
2368 NULL);
2369 cv_init(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE], NULL,
2370 CV_DEFAULT, NULL);
2371 cv_init(&mi->mi_async_cv, NULL, CV_DEFAULT, NULL);
2372 cv_init(&mi->mi_inact_req_cv, NULL, CV_DEFAULT, NULL);
2374 mi->mi_vfsp = vfsp;
2375 mi->mi_zone = zone;
2376 zone_init_ref(&mi->mi_zone_ref);
2377 zone_hold_ref(zone, &mi->mi_zone_ref, ZONE_REF_NFSV4);
2378 nfs4_mi_zonelist_add(mi);
2381 * Initialize the <open owner/cred> hash table.
2383 for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
2384 bucketp = &(mi->mi_oo_list[i]);
2385 mutex_init(&bucketp->b_lock, NULL, MUTEX_DEFAULT, NULL);
2386 list_create(&bucketp->b_oo_hash_list,
2387 sizeof (nfs4_open_owner_t),
2388 offsetof(nfs4_open_owner_t, oo_hash_node));
2392 * Initialize the freed open owner list.
2394 mi->mi_foo_num = 0;
2395 mi->mi_foo_max = NFS4_NUM_FREED_OPEN_OWNERS;
2396 list_create(&mi->mi_foo_list, sizeof (nfs4_open_owner_t),
2397 offsetof(nfs4_open_owner_t, oo_foo_node));
2399 list_create(&mi->mi_lost_state, sizeof (nfs4_lost_rqst_t),
2400 offsetof(nfs4_lost_rqst_t, lr_node));
2402 list_create(&mi->mi_bseqid_list, sizeof (nfs4_bseqid_entry_t),
2403 offsetof(nfs4_bseqid_entry_t, bs_node));
2406 * Initialize the msg buffer.
2408 list_create(&mi->mi_msg_list, sizeof (nfs4_debug_msg_t),
2409 offsetof(nfs4_debug_msg_t, msg_node));
2410 mi->mi_msg_count = 0;
2411 mutex_init(&mi->mi_msg_list_lock, NULL, MUTEX_DEFAULT, NULL);
2414 * Initialize kstats
2416 nfs4_mnt_kstat_init(vfsp);
2419 * Initialize the shared filehandle pool.
2421 sfh4_createtab(&mi->mi_filehandles);
2424 * Save server path we're attempting to mount.
2426 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2427 origsvp = copy_svp(svp);
2428 nfs_rw_exit(&svp->sv_lock);
2431 * Make the GETFH call to get root fh for each replica.
2433 if (svp_head->sv_next)
2434 droptext = ", dropping replica";
2437 * If the uid is set then set the creds for secure mounts
2438 * by proxy processes such as automountd.
2440 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2441 if (svp->sv_secdata->uid != 0 &&
2442 svp->sv_secdata->rpcflavor == RPCSEC_GSS) {
2443 lcr = crdup(cr);
2444 (void) crsetugid(lcr, svp->sv_secdata->uid, crgetgid(cr));
2445 tcr = lcr;
2447 nfs_rw_exit(&svp->sv_lock);
2448 for (svp = svp_head; svp; svp = svp->sv_next) {
2449 if (nfs4_chkdup_servinfo4(svp_head, svp)) {
2450 nfs_cmn_err(error, CE_WARN,
2451 VERS_MSG "Host %s is a duplicate%s",
2452 svp->sv_hostname, droptext);
2453 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2454 svp->sv_flags |= SV4_NOTINUSE;
2455 nfs_rw_exit(&svp->sv_lock);
2456 continue;
2458 mi->mi_curr_serv = svp;
2461 * Just in case server path being mounted contains
2462 * symlinks and fails w/STALE, save the initial sv_path
2463 * so we can redrive the initial mount compound with the
2464 * initial sv_path -- not a symlink-expanded version.
2466 * This could only happen if a symlink was expanded
2467 * and the expanded mount compound failed stale. Because
2468 * it could be the case that the symlink was removed at
2469 * the server (and replaced with another symlink/dir),
2470 * we need to use the initial sv_path when attempting
2471 * to re-lookup everything and recover.
2473 * Other mount errors should eventually be handled here also
2474 * (NFS4ERR_DELAY, NFS4ERR_RESOURCE). For now, all mount
2475 * failures will result in mount being redriven a few times.
2477 num_retry = nfs4_max_mount_retry;
2478 do {
2479 nfs4getfh_otw(mi, svp, &tmp_vtype,
2480 ((flags & NFSMNT_PUBLIC) ? NFS4_GETFH_PUBLIC : 0) |
2481 NFS4_GETFH_NEEDSOP, tcr, &e);
2483 if (e.error == 0 && e.stat == NFS4_OK)
2484 break;
2487 * For some reason, the mount compound failed. Before
2488 * retrying, we need to restore original conditions.
2490 svp = restore_svp(mi, svp, origsvp);
2491 svp_head = svp;
2493 } while (num_retry-- > 0);
2494 error = e.error ? e.error : geterrno4(e.stat);
2495 if (error) {
2496 nfs_cmn_err(error, CE_WARN,
2497 VERS_MSG "initial call to %s failed%s: %m",
2498 svp->sv_hostname, droptext);
2499 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2500 svp->sv_flags |= SV4_NOTINUSE;
2501 nfs_rw_exit(&svp->sv_lock);
2502 mi->mi_flags &= ~MI4_RECOV_FAIL;
2503 mi->mi_error = 0;
2504 continue;
2507 if (tmp_vtype == VBAD) {
2508 zcmn_err(mi->mi_zone->zone_id, CE_WARN,
2509 VERS_MSG "%s returned a bad file type for "
2510 "root%s", svp->sv_hostname, droptext);
2511 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2512 svp->sv_flags |= SV4_NOTINUSE;
2513 nfs_rw_exit(&svp->sv_lock);
2514 continue;
2517 if (vtype == VNON) {
2518 vtype = tmp_vtype;
2519 } else if (vtype != tmp_vtype) {
2520 zcmn_err(mi->mi_zone->zone_id, CE_WARN,
2521 VERS_MSG "%s returned a different file type "
2522 "for root%s", svp->sv_hostname, droptext);
2523 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2524 svp->sv_flags |= SV4_NOTINUSE;
2525 nfs_rw_exit(&svp->sv_lock);
2526 continue;
2528 if (firstsvp == NULL)
2529 firstsvp = svp;
2532 if (firstsvp == NULL) {
2533 if (error == 0)
2534 error = ENOENT;
2535 goto bad;
2538 mi->mi_curr_serv = svp = firstsvp;
2539 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2540 ASSERT((mi->mi_curr_serv->sv_flags & SV4_NOTINUSE) == 0);
2541 fh.nfs_fh4_len = svp->sv_fhandle.fh_len;
2542 fh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
2543 mi->mi_rootfh = sfh4_get(&fh, mi);
2544 fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
2545 fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
2546 mi->mi_srvparentfh = sfh4_get(&fh, mi);
2547 nfs_rw_exit(&svp->sv_lock);
2550 * Get the fname for filesystem root.
2552 mi->mi_fname = fn_get(NULL, ".", mi->mi_rootfh);
2553 mfname = mi->mi_fname;
2554 fn_hold(mfname);
2557 * Make the root vnode without attributes.
2559 rtvp = makenfs4node_by_fh(mi->mi_rootfh, NULL,
2560 &mfname, NULL, mi, cr, gethrtime());
2561 rtvp->v_type = vtype;
2563 mi->mi_curread = mi->mi_tsize;
2564 mi->mi_curwrite = mi->mi_stsize;
2567 * Start the manager thread responsible for handling async worker
2568 * threads.
2570 MI4_HOLD(mi);
2571 VFS_HOLD(vfsp); /* add reference for thread */
2572 mi->mi_manager_thread = zthread_create(NULL, 0, nfs4_async_manager,
2573 vfsp, 0, minclsyspri);
2574 ASSERT(mi->mi_manager_thread != NULL);
2577 * Create the thread that handles over-the-wire calls for
2578 * fop_inactive.
2579 * This needs to happen after the manager thread is created.
2581 MI4_HOLD(mi);
2582 mi->mi_inactive_thread = zthread_create(NULL, 0, nfs4_inactive_thread,
2583 mi, 0, minclsyspri);
2584 ASSERT(mi->mi_inactive_thread != NULL);
2586 /* If we didn't get a type, get one now */
2587 if (rtvp->v_type == VNON) {
2588 va.va_mask = AT_TYPE;
2589 error = nfs4getattr(rtvp, &va, tcr);
2590 if (error)
2591 goto bad;
2592 rtvp->v_type = va.va_type;
2595 mi->mi_type = rtvp->v_type;
2597 mutex_enter(&mi->mi_lock);
2598 mi->mi_flags &= ~MI4_MOUNTING;
2599 mutex_exit(&mi->mi_lock);
2601 /* Update VFS with new server and path info */
2602 if ((strcmp(svp->sv_hostname, origsvp->sv_hostname) != 0) ||
2603 (strcmp(svp->sv_path, origsvp->sv_path) != 0)) {
2604 len = svp->sv_hostnamelen + svp->sv_pathlen;
2605 resource = kmem_zalloc(len, KM_SLEEP);
2606 (void) strcat(resource, svp->sv_hostname);
2607 (void) strcat(resource, ":");
2608 (void) strcat(resource, svp->sv_path);
2609 vfs_setresource(vfsp, resource, 0);
2610 kmem_free(resource, len);
2613 sv4_free(origsvp);
2614 *rtvpp = rtvp;
2615 if (lcr != NULL)
2616 crfree(lcr);
2618 return (0);
2619 bad:
2621 * An error occurred somewhere, need to clean up...
2623 if (lcr != NULL)
2624 crfree(lcr);
2626 if (rtvp != NULL) {
2628 * We need to release our reference to the root vnode and
2629 * destroy the mntinfo4 struct that we just created.
2631 rp = VTOR4(rtvp);
2632 if (rp->r_flags & R4HASHED)
2633 rp4_rmhash(rp);
2634 VN_RELE(rtvp);
2636 nfs4_async_stop(vfsp);
2637 nfs4_async_manager_stop(vfsp);
2638 removed = nfs4_mi_zonelist_remove(mi);
2639 if (removed)
2640 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
2643 * This releases the initial "hold" of the mi since it will never
2644 * be referenced by the vfsp. Also, when mount returns to vfs.c
2645 * with an error, the vfsp will be destroyed, not rele'd.
2647 MI4_RELE(mi);
2649 if (origsvp != NULL)
2650 sv4_free(origsvp);
2652 *rtvpp = NULL;
2653 return (error);
2657 * vfs operations
2659 static int
2660 nfs4_unmount(vfs_t *vfsp, int flag, cred_t *cr)
2662 mntinfo4_t *mi;
2663 ushort_t omax;
2664 int removed;
2666 bool_t must_unlock;
2668 nfs4_ephemeral_tree_t *eph_tree;
2670 if (secpolicy_fs_unmount(cr, vfsp) != 0)
2671 return (EPERM);
2673 mi = VFTOMI4(vfsp);
2675 if (flag & MS_FORCE) {
2676 vfsp->vfs_flag |= VFS_UNMOUNTED;
2677 if (nfs_zone() != mi->mi_zone) {
2679 * If the request is coming from the wrong zone,
2680 * we don't want to create any new threads, and
2681 * performance is not a concern. Do everything
2682 * inline.
2684 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2685 "nfs4_unmount x-zone forced unmount of vfs %p\n",
2686 (void *)vfsp));
2687 nfs4_free_mount(vfsp, flag, cr);
2688 } else {
2690 * Free data structures asynchronously, to avoid
2691 * blocking the current thread (for performance
2692 * reasons only).
2694 async_free_mount(vfsp, flag, cr);
2697 return (0);
2701 * Wait until all asynchronous putpage operations on
2702 * this file system are complete before flushing rnodes
2703 * from the cache.
2705 omax = mi->mi_max_threads;
2706 if (nfs4_async_stop_sig(vfsp))
2707 return (EINTR);
2709 r4flush(vfsp, cr);
2712 * About the only reason that this would fail would be
2713 * that the harvester is already busy tearing down this
2714 * node. So we fail back to the caller and let them try
2715 * again when needed.
2717 if (nfs4_ephemeral_umount(mi, flag, cr,
2718 &must_unlock, &eph_tree)) {
2719 ASSERT(must_unlock == FALSE);
2720 mutex_enter(&mi->mi_async_lock);
2721 mi->mi_max_threads = omax;
2722 mutex_exit(&mi->mi_async_lock);
2724 return (EBUSY);
2728 * If there are any active vnodes on this file system,
2729 * then the file system is busy and can't be unmounted.
2731 if (check_rtable4(vfsp)) {
2732 nfs4_ephemeral_umount_unlock(&must_unlock, &eph_tree);
2734 mutex_enter(&mi->mi_async_lock);
2735 mi->mi_max_threads = omax;
2736 mutex_exit(&mi->mi_async_lock);
2738 return (EBUSY);
2742 * The unmount can't fail from now on, so record any
2743 * ephemeral changes.
2745 nfs4_ephemeral_umount_activate(mi, &must_unlock, &eph_tree);
2748 * There are no active files that could require over-the-wire
2749 * calls to the server, so stop the async manager and the
2750 * inactive thread.
2752 nfs4_async_manager_stop(vfsp);
2755 * Destroy all rnodes belonging to this file system from the
2756 * rnode hash queues and purge any resources allocated to
2757 * them.
2759 destroy_rtable4(vfsp, cr);
2760 vfsp->vfs_flag |= VFS_UNMOUNTED;
2762 nfs4_remove_mi_from_server(mi, NULL);
2763 removed = nfs4_mi_zonelist_remove(mi);
2764 if (removed)
2765 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
2767 return (0);
2771 * find root of nfs
2773 static int
2774 nfs4_root(vfs_t *vfsp, vnode_t **vpp)
2776 mntinfo4_t *mi;
2777 vnode_t *vp;
2778 nfs4_fname_t *mfname;
2779 servinfo4_t *svp;
2781 mi = VFTOMI4(vfsp);
2783 if (nfs_zone() != mi->mi_zone)
2784 return (EPERM);
2786 svp = mi->mi_curr_serv;
2787 if (svp) {
2788 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2789 if (svp->sv_flags & SV4_ROOT_STALE) {
2790 nfs_rw_exit(&svp->sv_lock);
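/*
 * Re-check SV4_ROOT_STALE under the writer lock; if it is still
 * set, clear it and fail this lookup with ENOENT.
 */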
2792 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2793 if (svp->sv_flags & SV4_ROOT_STALE) {
2794 svp->sv_flags &= ~SV4_ROOT_STALE;
2795 nfs_rw_exit(&svp->sv_lock);
2796 return (ENOENT);
2798 nfs_rw_exit(&svp->sv_lock);
2799 } else
2800 nfs_rw_exit(&svp->sv_lock);
2803 mfname = mi->mi_fname;
2804 fn_hold(mfname);
2805 vp = makenfs4node_by_fh(mi->mi_rootfh, NULL, &mfname, NULL,
2806 VFTOMI4(vfsp), CRED(), gethrtime());
2808 if (VTOR4(vp)->r_flags & R4STALE) {
2809 VN_RELE(vp);
2810 return (ENOENT);
2813 ASSERT(vp->v_type == VNON || vp->v_type == mi->mi_type);
2815 vp->v_type = mi->mi_type;
2817 *vpp = vp;
2819 return (0);
2822 static int
2823 nfs4_statfs_otw(vnode_t *vp, struct statvfs64 *sbp, cred_t *cr)
2825 int error;
2826 nfs4_ga_res_t gar;
2827 nfs4_ga_ext_res_t ger;
2829 gar.n4g_ext_res = &ger;
2831 if (error = nfs4_attr_otw(vp, TAG_FSINFO, &gar,
2832 NFS4_STATFS_ATTR_MASK, cr))
2833 return (error);
2835 *sbp = gar.n4g_ext_res->n4g_sb;
2837 return (0);
2841 * Get file system statistics.
2843 static int
2844 nfs4_statvfs(vfs_t *vfsp, struct statvfs64 *sbp)
2846 int error;
2847 vnode_t *vp;
2848 cred_t *cr;
2850 error = nfs4_root(vfsp, &vp);
2851 if (error)
2852 return (error);
2854 cr = CRED();
2856 error = nfs4_statfs_otw(vp, sbp, cr);
2857 if (!error) {
2858 (void) strncpy(sbp->f_basetype,
2859 vfssw[vfsp->vfs_fstype].vsw_name, FSTYPSZ);
2860 sbp->f_flag = vf_to_stf(vfsp->vfs_flag);
2861 } else {
2862 nfs4_purge_stale_fh(error, vp, cr);
2865 VN_RELE(vp);
2867 return (error);
2870 static kmutex_t nfs4_syncbusy;
2873 * Flush dirty nfs files for file system vfsp.
2874 * If vfsp == NULL, all nfs files are flushed.
2876 * SYNC_CLOSE in flag is passed to us to
2877 * indicate that we are shutting down and/or
2878 * rebooting.
2880 static int
2881 nfs4_sync(vfs_t *vfsp, short flag, cred_t *cr)
2884 * Cross-zone calls are OK here, since this translates to a
2885 * fop_putpage(B_ASYNC), which gets picked up by the right zone.
2887 if (!(flag & SYNC_ATTR) && mutex_tryenter(&nfs4_syncbusy) != 0) {
2888 r4flush(vfsp, cr);
2889 mutex_exit(&nfs4_syncbusy);
2893 * if SYNC_CLOSE is set then we know that
2894 * the system is rebooting, mark the mntinfo
2895 * for later examination.
2897 if (vfsp && (flag & SYNC_CLOSE)) {
2898 mntinfo4_t *mi;
2900 mi = VFTOMI4(vfsp);
2901 if (!(mi->mi_flags & MI4_SHUTDOWN)) {
2902 mutex_enter(&mi->mi_lock);
2903 mi->mi_flags |= MI4_SHUTDOWN;
2904 mutex_exit(&mi->mi_lock);
2907 return (0);
2911 * vget is difficult, if not impossible, to support in v4 because we don't
2912 * know the parent directory or name, which makes it impossible to create a
2913 * useful shadow vnode. And we need the shadow vnode for things like
2914 * OPEN.
2917 /* ARGSUSED */
2919 * XXX Check nfs4_vget_pseudo() for dependency.
2921 static int
2922 nfs4_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
2924 return (EREMOTE);
2928 * nfs4_mountroot gets called in the case where we are diskless booting. All
2929 * we need from here is the ability to get the server info and from there we
2930 * can simply call nfs4rootvp.
2932 /* ARGSUSED */
2933 static int
2934 nfs4_mountroot(vfs_t *vfsp, whymountroot_t why)
2936 vnode_t *rtvp;
2937 char root_hostname[SYS_NMLN+1];
2938 struct servinfo4 *svp;
2939 int error;
2940 int vfsflags;
2941 size_t size;
2942 char *root_path;
2943 struct pathname pn;
2944 char *name;
2945 cred_t *cr;
2946 mntinfo4_t *mi;
2947 struct nfs_args args; /* nfs mount arguments */
2948 static char token[10];
2949 nfs4_error_t n4e;
2951 bzero(&args, sizeof (args));
2953 /* do this BEFORE getfile which causes xid stamps to be initialized */
2954 clkset(-1L); /* hack for now - until we get time svc? */
2956 if (why == ROOT_REMOUNT) {
2958 * Shouldn't happen.
2960 panic("nfs4_mountroot: why == ROOT_REMOUNT");
2963 if (why == ROOT_UNMOUNT) {
2965 * Nothing to do for NFS.
2967 return (0);
2971 * why == ROOT_INIT
2974 name = token;
2975 *name = 0;
2976 (void) getfsname("root", name, sizeof (token));
2978 pn_alloc(&pn);
2979 root_path = pn.pn_path;
2981 svp = kmem_zalloc(sizeof (*svp), KM_SLEEP);
2982 nfs_rw_init(&svp->sv_lock, NULL, RW_DEFAULT, NULL);
2983 svp->sv_knconf = kmem_zalloc(sizeof (*svp->sv_knconf), KM_SLEEP);
2984 svp->sv_knconf->knc_protofmly = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
2985 svp->sv_knconf->knc_proto = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
2988 * Get server address
2989 * Get the root path
2990 * Get server's transport
2991 * Get server's hostname
2992 * Get options
2994 args.addr = &svp->sv_addr;
2995 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2996 args.fh = (char *)&svp->sv_fhandle;
2997 args.knconf = svp->sv_knconf;
2998 args.hostname = root_hostname;
2999 vfsflags = 0;
3000 if (error = mount_root(*name ? name : "root", root_path, NFS_V4,
3001 &args, &vfsflags)) {
3002 if (error == EPROTONOSUPPORT)
3003 nfs_cmn_err(error, CE_WARN, "nfs4_mountroot: "
3004 "mount_root failed: server doesn't support NFS V4");
3005 else
3006 nfs_cmn_err(error, CE_WARN,
3007 "nfs4_mountroot: mount_root failed: %m");
3008 nfs_rw_exit(&svp->sv_lock);
3009 sv4_free(svp);
3010 pn_free(&pn);
3011 return (error);
3013 nfs_rw_exit(&svp->sv_lock);
3014 svp->sv_hostnamelen = (int)(strlen(root_hostname) + 1);
3015 svp->sv_hostname = kmem_alloc(svp->sv_hostnamelen, KM_SLEEP);
3016 (void) strcpy(svp->sv_hostname, root_hostname);
3018 svp->sv_pathlen = (int)(strlen(root_path) + 1);
3019 svp->sv_path = kmem_alloc(svp->sv_pathlen, KM_SLEEP);
3020 (void) strcpy(svp->sv_path, root_path);
3023 * Force root partition to always be mounted with AUTH_UNIX for now
3025 svp->sv_secdata = kmem_alloc(sizeof (*svp->sv_secdata), KM_SLEEP);
3026 svp->sv_secdata->secmod = AUTH_UNIX;
3027 svp->sv_secdata->rpcflavor = AUTH_UNIX;
3028 svp->sv_secdata->data = NULL;
3030 cr = crgetcred();
3031 rtvp = NULL;
3033 error = nfs4rootvp(&rtvp, vfsp, svp, args.flags, cr, global_zone);
3035 if (error) {
3036 crfree(cr);
3037 pn_free(&pn);
3038 sv4_free(svp);
3039 return (error);
3042 mi = VTOMI4(rtvp);
3045 * Send client id to the server, if necessary
3047 nfs4_error_zinit(&n4e);
3048 nfs4setclientid(mi, cr, FALSE, &n4e);
3049 error = n4e.error;
3051 crfree(cr);
3053 if (error) {
3054 pn_free(&pn);
3055 goto errout;
3058 error = nfs4_setopts(rtvp, DATAMODEL_NATIVE, &args);
3059 if (error) {
3060 nfs_cmn_err(error, CE_WARN,
3061 "nfs4_mountroot: invalid root mount options");
3062 pn_free(&pn);
3063 goto errout;
3066 (void) vfs_lock_wait(vfsp);
3067 vfs_add(NULL, vfsp, vfsflags);
3068 vfs_unlock(vfsp);
3070 size = strlen(svp->sv_hostname);
3071 (void) strcpy(rootfs.bo_name, svp->sv_hostname);
3072 rootfs.bo_name[size] = ':';
3073 (void) strcpy(&rootfs.bo_name[size + 1], root_path);
3075 pn_free(&pn);
3077 errout:
3078 if (error) {
3079 sv4_free(svp);
3080 nfs4_async_stop(vfsp);
3081 nfs4_async_manager_stop(vfsp);
3084 if (rtvp != NULL)
3085 VN_RELE(rtvp);
3087 return (error);
3091 * Initialization routine for VFS routines. Should only be called once
3094 nfs4_vfsinit(void)
3096 mutex_init(&nfs4_syncbusy, NULL, MUTEX_DEFAULT, NULL);
3097 nfs4setclientid_init();
3098 nfs4_ephemeral_init();
3099 return (0);
3102 void
3103 nfs4_vfsfini(void)
3105 nfs4_ephemeral_fini();
3106 nfs4setclientid_fini();
3107 mutex_destroy(&nfs4_syncbusy);
3110 void
3111 nfs4_freevfs(vfs_t *vfsp)
3113 mntinfo4_t *mi;
3115 /* need to release the initial hold */
3116 mi = VFTOMI4(vfsp);
3119 * At this point, we can no longer reference the vfs
3120 * and need to inform other holders of the reference
3121 * to the mntinfo4_t.
3123 mi->mi_vfsp = NULL;
3125 MI4_RELE(mi);
3129 * Client side SETCLIENTID and SETCLIENTID_CONFIRM
3131 struct nfs4_server nfs4_server_lst =
3132 { &nfs4_server_lst, &nfs4_server_lst };
3134 kmutex_t nfs4_server_lst_lock;
3136 static void
3137 nfs4setclientid_init(void)
3139 mutex_init(&nfs4_server_lst_lock, NULL, MUTEX_DEFAULT, NULL);
3142 static void
3143 nfs4setclientid_fini(void)
3145 mutex_destroy(&nfs4_server_lst_lock);
3148 int nfs4_retry_sclid_delay = NFS4_RETRY_SCLID_DELAY;
3149 int nfs4_num_sclid_retries = NFS4_NUM_SCLID_RETRIES;
3152 * Set the clientid for the server for "mi". No-op if the clientid is
3153 * already set.
3155 * The recovery boolean should be set to TRUE if this function was called
3156 * by the recovery code, and FALSE otherwise. This is used to determine
3157 * if we need to call nfs4_start/end_op as well as grab the mi_recovlock
3158 * for adding a mntinfo4_t to a nfs4_server_t.
3160 * Error is returned via 'n4ep'. If there was a 'n4ep->stat' error, then
3161 * 'n4ep->error' is set to geterrno4(n4ep->stat).
3163 void
3164 nfs4setclientid(mntinfo4_t *mi, cred_t *cr, bool_t recovery, nfs4_error_t *n4ep)
3166 struct nfs4_server *np;
3167 struct servinfo4 *svp = mi->mi_curr_serv;
3168 nfs4_recov_state_t recov_state;
3169 int num_retries = 0;
3170 bool_t retry;
3171 cred_t *lcr = NULL;
3172 int retry_inuse = 1; /* only retry once on NFS4ERR_CLID_INUSE */
3173 time_t lease_time = 0;
3175 recov_state.rs_flags = 0;
3176 recov_state.rs_num_retry_despite_err = 0;
3177 ASSERT(n4ep != NULL);
3179 recov_retry:
3180 retry = FALSE;
3181 nfs4_error_zinit(n4ep);
3182 if (!recovery)
3183 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
3185 mutex_enter(&nfs4_server_lst_lock);
3186 np = servinfo4_to_nfs4_server(svp); /* This locks np if it is found */
3187 mutex_exit(&nfs4_server_lst_lock);
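/*
 * No nfs4_server_t exists for this server yet: create one, then
 * re-check the list under nfs4_server_lst_lock in case another
 * thread added an entry while we were allocating.
 */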
3188 if (!np) {
3189 struct nfs4_server *tnp;
3190 np = new_nfs4_server(svp, cr);
3191 mutex_enter(&np->s_lock);
3193 mutex_enter(&nfs4_server_lst_lock);
3194 tnp = servinfo4_to_nfs4_server(svp);
3195 if (tnp) {
3197 * another thread snuck in and put server on list.
3198 * since we aren't adding it to the nfs4_server_list
3199 * we need to set the ref count to 0 and destroy it.
3201 np->s_refcnt = 0;
3202 destroy_nfs4_server(np);
3203 np = tnp;
3204 } else {
3206 * do not give list a reference until everything
3207 * succeeds
3209 insque(np, &nfs4_server_lst);
3211 mutex_exit(&nfs4_server_lst_lock);
3213 ASSERT(MUTEX_HELD(&np->s_lock));
3215 * If we find the server already has N4S_CLIENTID_SET, then
3216 * just return; we've already done SETCLIENTID to that server
3218 if (np->s_flags & N4S_CLIENTID_SET) {
3219 /* add mi to np's mntinfo4_list */
3220 nfs4_add_mi_to_server(np, mi);
3221 if (!recovery)
3222 nfs_rw_exit(&mi->mi_recovlock);
3223 mutex_exit(&np->s_lock);
3224 nfs4_server_rele(np);
3225 return;
3227 mutex_exit(&np->s_lock);
3231 * Drop the mi_recovlock since nfs4_start_op will
3232 * acquire it again for us.
3234 if (!recovery) {
3235 nfs_rw_exit(&mi->mi_recovlock);
3237 n4ep->error = nfs4_start_op(mi, NULL, NULL, &recov_state);
3238 if (n4ep->error) {
3239 nfs4_server_rele(np);
3240 return;
3244 mutex_enter(&np->s_lock);
3245 while (np->s_flags & N4S_CLIENTID_PEND) {
3246 if (!cv_wait_sig(&np->s_clientid_pend, &np->s_lock)) {
3247 mutex_exit(&np->s_lock);
3248 nfs4_server_rele(np);
3249 if (!recovery)
3250 nfs4_end_op(mi, NULL, NULL, &recov_state,
3251 recovery);
3252 n4ep->error = EINTR;
3253 return;
3257 if (np->s_flags & N4S_CLIENTID_SET) {
3258 /* XXX copied/pasted from above */
3259 /* add mi to np's mntinfo4_list */
3260 nfs4_add_mi_to_server(np, mi);
3261 mutex_exit(&np->s_lock);
3262 nfs4_server_rele(np);
3263 if (!recovery)
3264 nfs4_end_op(mi, NULL, NULL, &recov_state, recovery);
3265 return;
3269 * Reset the N4S_CB_PINGED flag. This is used to
3270 * indicate if we have received a CB_NULL from the
3271 * server. Also we reset the waiter flag.
3273 np->s_flags &= ~(N4S_CB_PINGED | N4S_CB_WAITER);
3274 /* any failure must now clear this flag */
3275 np->s_flags |= N4S_CLIENTID_PEND;
3276 mutex_exit(&np->s_lock);
3277 nfs4setclientid_otw(mi, svp, cr, np, n4ep, &retry_inuse);
3279 if (n4ep->error == EACCES) {
3281 * If the uid is set then set the creds for secure mounts
3282 * by proxy processes such as automountd.
3284 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3285 if (svp->sv_secdata->uid != 0) {
3286 lcr = crdup(cr);
3287 (void) crsetugid(lcr, svp->sv_secdata->uid,
3288 crgetgid(cr));
3290 nfs_rw_exit(&svp->sv_lock);
3292 if (lcr != NULL) {
3293 mutex_enter(&np->s_lock);
3294 crfree(np->s_cred);
3295 np->s_cred = lcr;
3296 mutex_exit(&np->s_lock);
3297 nfs4setclientid_otw(mi, svp, lcr, np, n4ep,
3298 &retry_inuse);
3301 mutex_enter(&np->s_lock);
3302 lease_time = np->s_lease_time;
3303 np->s_flags &= ~N4S_CLIENTID_PEND;
3304 mutex_exit(&np->s_lock);
3306 if (n4ep->error != 0 || n4ep->stat != NFS4_OK) {
3308 * Start recovery if failover is a possibility. If
3309 * invoked by the recovery thread itself, then just
3310 * return and let it handle the failover first. NB:
3311 * recovery is not allowed if the mount is in progress
3312 * since the infrastructure is not sufficiently set up
3313 * to allow it. Just return the error (after suitable
3314 * retries).
3316 if (FAILOVER_MOUNT4(mi) && nfs4_try_failover(n4ep)) {
3317 (void) nfs4_start_recovery(n4ep, mi, NULL,
3318 NULL, NULL, NULL, OP_SETCLIENTID, NULL, NULL, NULL);
3320 * Don't retry here, just return and let
3321 * recovery take over.
3323 if (recovery)
3324 retry = FALSE;
3325 } else if (nfs4_rpc_retry_error(n4ep->error) ||
3326 n4ep->stat == NFS4ERR_RESOURCE ||
3327 n4ep->stat == NFS4ERR_STALE_CLIENTID) {
3329 retry = TRUE;
3331 * Always retry if we are in recovery, or if we once
3332 * had contact with the server (but it is now
3333 * overloaded).
3335 if (recovery == TRUE ||
3336 n4ep->error == ETIMEDOUT ||
3337 n4ep->error == ECONNRESET)
3338 num_retries = 0;
3339 } else if (retry_inuse && n4ep->error == 0 &&
3340 n4ep->stat == NFS4ERR_CLID_INUSE) {
3341 retry = TRUE;
3342 num_retries = 0;
3344 } else {
3346 * Since everything succeeded, give the list a reference if it
3347 * hasn't already been given one, either by add_new_nfs4_server()
3348 * or because (in a recovery situation) the server is already on
3349 * the list.
3351 mutex_enter(&np->s_lock);
3352 if ((np->s_flags & N4S_INSERTED) == 0) {
3353 np->s_refcnt++;
3354 np->s_flags |= N4S_INSERTED;
3356 mutex_exit(&np->s_lock);
3359 if (!recovery)
3360 nfs4_end_op(mi, NULL, NULL, &recov_state, recovery);
3363 if (retry && num_retries++ < nfs4_num_sclid_retries) {
3364 if (retry_inuse) {
3365 delay(SEC_TO_TICK(lease_time + nfs4_retry_sclid_delay));
3366 retry_inuse = 0;
3367 } else
3368 delay(SEC_TO_TICK(nfs4_retry_sclid_delay));
3370 nfs4_server_rele(np);
3371 goto recov_retry;
3375 if (n4ep->error == 0)
3376 n4ep->error = geterrno4(n4ep->stat);
3378 /* broadcast before release in case no other threads are waiting */
3379 cv_broadcast(&np->s_clientid_pend);
3380 nfs4_server_rele(np);
3383 int nfs4setclientid_otw_debug = 0;
3386 * This function handles the recovery of STALE_CLIENTID for SETCLIENTID_CONFIRM,
3387 * but nothing else; the calling function must be designed to handle those
3388 * other errors.
3390 static void
3391 nfs4setclientid_otw(mntinfo4_t *mi, struct servinfo4 *svp, cred_t *cr,
3392 struct nfs4_server *np, nfs4_error_t *ep, int *retry_inusep)
3394 COMPOUND4args_clnt args;
3395 COMPOUND4res_clnt res;
3396 nfs_argop4 argop[3];
3397 SETCLIENTID4args *s_args;
3398 SETCLIENTID4resok *s_resok;
3399 int doqueue = 1;
3400 nfs4_ga_res_t *garp = NULL;
3401 timespec_t prop_time, after_time;
3402 verifier4 verf;
3403 clientid4 tmp_clientid;
3405 ASSERT(!MUTEX_HELD(&np->s_lock));
3407 args.ctag = TAG_SETCLIENTID;
3409 args.array = argop;
3410 args.array_len = 3;
3412 /* PUTROOTFH */
3413 argop[0].argop = OP_PUTROOTFH;
3415 /* GETATTR */
3416 argop[1].argop = OP_GETATTR;
3417 argop[1].nfs_argop4_u.opgetattr.attr_request = FATTR4_LEASE_TIME_MASK;
3418 argop[1].nfs_argop4_u.opgetattr.mi = mi;
3420 /* SETCLIENTID */
3421 argop[2].argop = OP_SETCLIENTID;
3423 s_args = &argop[2].nfs_argop4_u.opsetclientid;
3425 mutex_enter(&np->s_lock);
3427 s_args->client.verifier = np->clidtosend.verifier;
3428 s_args->client.id_len = np->clidtosend.id_len;
3429 ASSERT(s_args->client.id_len <= NFS4_OPAQUE_LIMIT);
3430 s_args->client.id_val = np->clidtosend.id_val;
3433 * The callback needs to happen on a non-RDMA transport.
3434 * Check if we have saved the original knetconfig;
3435 * if so, use that instead.
3437 if (svp->sv_origknconf != NULL)
3438 nfs4_cb_args(np, svp->sv_origknconf, s_args);
3439 else
3440 nfs4_cb_args(np, svp->sv_knconf, s_args);
3442 mutex_exit(&np->s_lock);
3444 rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);
3446 if (ep->error)
3447 return;
3449 /* getattr lease_time res */
3450 if ((res.array_len >= 2) &&
3451 (res.array[1].nfs_resop4_u.opgetattr.status == NFS4_OK)) {
3452 garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
3454 #ifndef _LP64
3456 * The 32 bit client cannot handle a lease time greater than
3457 * (INT32_MAX/1000000). This is due to the use of the
3458 * lease_time in calls to drv_usectohz() in
3459 * nfs4_renew_lease_thread(). The problem is that
3460 * drv_usectohz() takes a time_t (which is just a long = 4
3461 * bytes) as its parameter. The lease_time is multiplied by
3462 * 1000000 to convert seconds to usecs for the parameter. If
3463 * a number bigger than (INT32_MAX/1000000) is used then we
3464 * overflow on the 32bit client.
3466 if (garp->n4g_ext_res->n4g_leasetime > (INT32_MAX/1000000)) {
3467 garp->n4g_ext_res->n4g_leasetime = INT32_MAX/1000000;
3469 #endif
3471 mutex_enter(&np->s_lock);
3472 np->s_lease_time = garp->n4g_ext_res->n4g_leasetime;
3475 * Keep track of the lease period for the mi's
3476 * mi_msg_list. We need an appropriate time
3477 * bound to associate past facts with a current
3478 * event. The lease period is perfect for this.
3480 mutex_enter(&mi->mi_msg_list_lock);
3481 mi->mi_lease_period = np->s_lease_time;
3482 mutex_exit(&mi->mi_msg_list_lock);
3483 mutex_exit(&np->s_lock);
3487 if (res.status == NFS4ERR_CLID_INUSE) {
3488 clientaddr4 *clid_inuse;
3490 if (!(*retry_inusep)) {
3491 clid_inuse = &res.array->nfs_resop4_u.
3492 opsetclientid.SETCLIENTID4res_u.client_using;
3494 zcmn_err(mi->mi_zone->zone_id, CE_NOTE,
3495 "NFS4 mount (SETCLIENTID failed)."
3496 " nfs4_client_id.id is in "
3497 "use already by: r_netid<%s> r_addr<%s>",
3498 clid_inuse->r_netid, clid_inuse->r_addr);
3502 * XXX - The client should be more robust in its
3503 * handling of clientid in use errors (regen another
3504 * clientid and try again?)
3506 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3507 return;
3510 if (res.status) {
3511 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3512 return;
3515 s_resok = &res.array[2].nfs_resop4_u.
3516 opsetclientid.SETCLIENTID4res_u.resok4;
3518 tmp_clientid = s_resok->clientid;
3520 verf = s_resok->setclientid_confirm;
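/*
 * Both the clientid and the confirm verifier are needed for the
 * SETCLIENTID_CONFIRM compound built below.
 */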
3522 #ifdef DEBUG
3523 if (nfs4setclientid_otw_debug) {
3524 union {
3525 clientid4 clientid;
3526 int foo[2];
3527 } cid;
3529 cid.clientid = s_resok->clientid;
3531 zcmn_err(mi->mi_zone->zone_id, CE_NOTE,
3532 "nfs4setclientid_otw: OK, clientid = %x,%x, "
3533 "verifier = %" PRIx64 "\n", cid.foo[0], cid.foo[1], verf);
3535 #endif
3537 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3539 /* Confirm the client id and get the lease_time attribute */
3541 args.ctag = TAG_SETCLIENTID_CF;
3543 args.array = argop;
3544 args.array_len = 1;
3546 argop[0].argop = OP_SETCLIENTID_CONFIRM;
3548 argop[0].nfs_argop4_u.opsetclientid_confirm.clientid = tmp_clientid;
3549 argop[0].nfs_argop4_u.opsetclientid_confirm.setclientid_confirm = verf;
3551 /* used to figure out RTT for np */
3552 gethrestime(&prop_time);
3554 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4setclientid_otw: "
3555 "start time: %ld sec %ld nsec", prop_time.tv_sec,
3556 prop_time.tv_nsec));
3558 rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);
3560 gethrestime(&after_time);
3561 mutex_enter(&np->s_lock);
3562 np->propagation_delay.tv_sec =
3563 MAX(1, after_time.tv_sec - prop_time.tv_sec);
3564 mutex_exit(&np->s_lock);
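/*
 * The elapsed time of the SETCLIENTID_CONFIRM call, floored at one
 * second, is recorded as the server's propagation delay.
 */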
3566 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4setclientid_otw: "
3567 "finish time: %ld sec ", after_time.tv_sec));
3569 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4setclientid_otw: "
3570 "propagation delay set to %ld sec",
3571 np->propagation_delay.tv_sec));
3573 if (ep->error)
3574 return;
3576 if (res.status == NFS4ERR_CLID_INUSE) {
3577 clientaddr4 *clid_inuse;
3579 if (!(*retry_inusep)) {
3580 clid_inuse = &res.array->nfs_resop4_u.
3581 opsetclientid.SETCLIENTID4res_u.client_using;
3583 zcmn_err(mi->mi_zone->zone_id, CE_NOTE,
3584 "SETCLIENTID_CONFIRM failed. "
3585 "nfs4_client_id.id is in use already by: "
3586 "r_netid<%s> r_addr<%s>",
3587 clid_inuse->r_netid, clid_inuse->r_addr);
3590 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3591 return;
3594 if (res.status) {
3595 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3596 return;
3599 mutex_enter(&np->s_lock);
3600 np->clientid = tmp_clientid;
3601 np->s_flags |= N4S_CLIENTID_SET;
3603 /* Add mi to np's mntinfo4 list */
3604 nfs4_add_mi_to_server(np, mi);
3606 if (np->lease_valid == NFS4_LEASE_NOT_STARTED) {
3608 * Start lease management thread.
3609 * Keep trying until we succeed.
3612 np->s_refcnt++; /* pass reference to thread */
3613 (void) zthread_create(NULL, 0, nfs4_renew_lease_thread, np, 0,
3614 minclsyspri);
3616 mutex_exit(&np->s_lock);
3618 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3622 * Add mi to sp's mntinfo4_list if it isn't already in the list. Makes
3623 * mi's clientid the same as sp's.
3624 * Assumes sp is locked down.
3626 void
3627 nfs4_add_mi_to_server(nfs4_server_t *sp, mntinfo4_t *mi)
3629 mntinfo4_t *tmi;
3630 int in_list = 0;
3632 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
3633 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
3634 ASSERT(sp != &nfs4_server_lst);
3635 ASSERT(MUTEX_HELD(&sp->s_lock));
3637 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3638 "nfs4_add_mi_to_server: add mi %p to sp %p",
3639 (void*)mi, (void*)sp));
3641 for (tmi = sp->mntinfo4_list;
3642 tmi != NULL;
3643 tmi = tmi->mi_clientid_next) {
3644 if (tmi == mi) {
3645 NFS4_DEBUG(nfs4_client_lease_debug,
3646 (CE_NOTE,
3647 "nfs4_add_mi_to_server: mi in list"));
3648 in_list = 1;
3653 * First put a hold on the mntinfo4's vfsp so that references via
3654 * mntinfo4_list will be valid.
3656 if (!in_list)
3657 VFS_HOLD(mi->mi_vfsp);
3659 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4_add_mi_to_server: "
3660 "hold vfs %p for mi: %p", (void*)mi->mi_vfsp, (void*)mi));
3662 if (!in_list) {
3663 if (sp->mntinfo4_list)
3664 sp->mntinfo4_list->mi_clientid_prev = mi;
3665 mi->mi_clientid_next = sp->mntinfo4_list;
3666 mi->mi_srv = sp;
3667 sp->mntinfo4_list = mi;
3668 mi->mi_srvsettime = gethrestime_sec();
3669 mi->mi_srvset_cnt++;
3672 /* set mi's clientid to that of sp's for later matching */
3673 mi->mi_clientid = sp->clientid;
3676 * Update the clientid for any other mi's belonging to sp. This
3677 * must be done here while we hold sp->s_lock, so that
3678 * find_nfs4_server() continues to work.
3681 for (tmi = sp->mntinfo4_list;
3682 tmi != NULL;
3683 tmi = tmi->mi_clientid_next) {
3684 if (tmi != mi) {
3685 tmi->mi_clientid = sp->clientid;
3691 * Remove the mi from sp's mntinfo4_list and release its reference.
3692 * Exception: if mi still has open files, flag it for later removal (when
3693 * all the files are closed).
3695 * If this is the last mntinfo4 in sp's list then tell the lease renewal
3696 * thread to exit.
3698 static void
3699 nfs4_remove_mi_from_server_nolock(mntinfo4_t *mi, nfs4_server_t *sp)
3701 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3702 "nfs4_remove_mi_from_server_nolock: remove mi %p from sp %p",
3703 (void*)mi, (void*)sp));
3705 ASSERT(sp != NULL);
3706 ASSERT(MUTEX_HELD(&sp->s_lock));
3707 ASSERT(mi->mi_open_files >= 0);
3710 * First make sure this mntinfo4 can be taken off the list,
3711 * i.e. it doesn't have any open files remaining.
3713 if (mi->mi_open_files > 0) {
3714 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3715 "nfs4_remove_mi_from_server_nolock: don't "
3716 "remove mi since it still has files open"));
3718 mutex_enter(&mi->mi_lock);
3719 mi->mi_flags |= MI4_REMOVE_ON_LAST_CLOSE;
3720 mutex_exit(&mi->mi_lock);
3721 return;
3724 VFS_HOLD(mi->mi_vfsp);
3725 remove_mi(sp, mi);
3726 VFS_RELE(mi->mi_vfsp);
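/*
 * The temporary VFS_HOLD above keeps the vfs alive across
 * remove_mi(), which drops the list's own vfs reference.
 */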
3728 if (sp->mntinfo4_list == NULL) {
3729 /* last fs unmounted, kill the thread */
3730 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3731 "remove_mi_from_nfs4_server_nolock: kill the thread"));
3732 nfs4_mark_srv_dead(sp);
3737 * Remove mi from sp's mntinfo4_list and release the vfs reference.
3739 static void
3740 remove_mi(nfs4_server_t *sp, mntinfo4_t *mi)
3742 ASSERT(MUTEX_HELD(&sp->s_lock));
3745 * We release a reference, and the caller must still have a
3746 * reference.
3748 ASSERT(mi->mi_vfsp->vfs_count >= 2);
3750 if (mi->mi_clientid_prev) {
3751 mi->mi_clientid_prev->mi_clientid_next = mi->mi_clientid_next;
3752 } else {
3753 /* This is the first mi in sp's mntinfo4_list */
3755 * Make sure the first mntinfo4 in the list is the actual
3756 * mntinfo4 passed in.
3758 ASSERT(sp->mntinfo4_list == mi);
3760 sp->mntinfo4_list = mi->mi_clientid_next;
3762 if (mi->mi_clientid_next)
3763 mi->mi_clientid_next->mi_clientid_prev = mi->mi_clientid_prev;
3765 /* Now mark the mntinfo4's links as being removed */
3766 mi->mi_clientid_prev = mi->mi_clientid_next = NULL;
3767 mi->mi_srv = NULL;
3768 mi->mi_srvset_cnt++;
3770 VFS_RELE(mi->mi_vfsp);
3774 * Free all the entries in sp's mntinfo4_list.
3776 static void
3777 remove_all_mi(nfs4_server_t *sp)
3779 mntinfo4_t *mi;
3781 ASSERT(MUTEX_HELD(&sp->s_lock));
3783 while (sp->mntinfo4_list != NULL) {
3784 mi = sp->mntinfo4_list;
3786 * Grab a reference in case there is only one left (which
3787 * remove_mi() frees).
3789 VFS_HOLD(mi->mi_vfsp);
3790 remove_mi(sp, mi);
3791 VFS_RELE(mi->mi_vfsp);
3796 * Remove the mi from sp's mntinfo4_list as above, and rele the vfs.
3798 * This version can be called with a null nfs4_server_t arg,
3799 * and will either find the right one and handle locking, or
3800 * do nothing because the mi wasn't added to an sp's mntinfo4_list.
3802 void
3803 nfs4_remove_mi_from_server(mntinfo4_t *mi, nfs4_server_t *esp)
3805 nfs4_server_t *sp;
3807 if (esp) {
3808 nfs4_remove_mi_from_server_nolock(mi, esp);
3809 return;
3812 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
3813 if (sp = find_nfs4_server_all(mi, 1)) {
3814 nfs4_remove_mi_from_server_nolock(mi, sp);
3815 mutex_exit(&sp->s_lock);
3816 nfs4_server_rele(sp);
3818 nfs_rw_exit(&mi->mi_recovlock);
3822 * Return TRUE if the given server has any non-unmounted filesystems.
3825 bool_t
3826 nfs4_fs_active(nfs4_server_t *sp)
3828 mntinfo4_t *mi;
3830 ASSERT(MUTEX_HELD(&sp->s_lock));
3832 for (mi = sp->mntinfo4_list; mi != NULL; mi = mi->mi_clientid_next) {
3833 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
3834 return (TRUE);
3837 return (FALSE);
3841 * Mark sp as finished and notify any waiters.
3844 void
3845 nfs4_mark_srv_dead(nfs4_server_t *sp)
3847 ASSERT(MUTEX_HELD(&sp->s_lock));
3849 sp->s_thread_exit = NFS4_THREAD_EXIT;
3850 cv_broadcast(&sp->cv_thread_exit);
3854 * Create a new nfs4_server_t structure.
3855 * Returns new node unlocked and not in list, but with a reference count of
3856 * 1.
3858 struct nfs4_server *
3859 new_nfs4_server(struct servinfo4 *svp, cred_t *cr)
3861 struct nfs4_server *np;
3862 timespec_t tt;
3863 union {
3864 struct {
3865 uint32_t sec;
3866 uint32_t subsec;
3867 } un_curtime;
3868 verifier4 un_verifier;
3869 } nfs4clientid_verifier;
3871 * We change this ID string carefully and with the Solaris
3872 * NFS server behaviour in mind. "+referrals" indicates
3873 * a client that can handle an NFSv4 referral.
3875 char id_val[] = "Solaris: %s, NFSv4 kernel client +referrals";
3876 int len;
3878 np = kmem_zalloc(sizeof (struct nfs4_server), KM_SLEEP);
3879 np->saddr.len = svp->sv_addr.len;
3880 np->saddr.maxlen = svp->sv_addr.maxlen;
3881 np->saddr.buf = kmem_alloc(svp->sv_addr.maxlen, KM_SLEEP);
3882 bcopy(svp->sv_addr.buf, np->saddr.buf, svp->sv_addr.len);
3883 np->s_refcnt = 1;
3886 * Build the nfs_client_id4 for this server mount. Ensure
3887 * the verifier is useful and that the identification is
3888 * somehow based on the server's address for the case of
3889 * multi-homed servers.
3891 nfs4clientid_verifier.un_verifier = 0;
3892 gethrestime(&tt);
3893 nfs4clientid_verifier.un_curtime.sec = (uint32_t)tt.tv_sec;
3894 nfs4clientid_verifier.un_curtime.subsec = (uint32_t)tt.tv_nsec;
3895 np->clidtosend.verifier = nfs4clientid_verifier.un_verifier;
3898 * calculate the length of the opaque identifier. Subtract 2
3899 * for the "%s" and add the traditional +1 for null
3900 * termination.
3902 len = strlen(id_val) - 2 + strlen(uts_nodename()) + 1;
3903 np->clidtosend.id_len = len + np->saddr.maxlen;
3905 np->clidtosend.id_val = kmem_alloc(np->clidtosend.id_len, KM_SLEEP);
3906 (void) sprintf(np->clidtosend.id_val, id_val, uts_nodename());
3907 bcopy(np->saddr.buf, &np->clidtosend.id_val[len], np->saddr.len);
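/*
 * The opaque id built above is the formatted nodename string followed
 * by the raw server address, so multi-homed servers get distinct ids.
 */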
3909 np->s_flags = 0;
3910 np->mntinfo4_list = NULL;
3911 /* save cred for issuing rfs4calls inside the renew thread */
3912 crhold(cr);
3913 np->s_cred = cr;
3914 cv_init(&np->cv_thread_exit, NULL, CV_DEFAULT, NULL);
3915 mutex_init(&np->s_lock, NULL, MUTEX_DEFAULT, NULL);
3916 nfs_rw_init(&np->s_recovlock, NULL, RW_DEFAULT, NULL);
3917 list_create(&np->s_deleg_list, sizeof (rnode4_t),
3918 offsetof(rnode4_t, r_deleg_link));
3919 np->s_thread_exit = 0;
3920 np->state_ref_count = 0;
3921 np->lease_valid = NFS4_LEASE_NOT_STARTED;
3922 cv_init(&np->s_cv_otw_count, NULL, CV_DEFAULT, NULL);
3923 cv_init(&np->s_clientid_pend, NULL, CV_DEFAULT, NULL);
3924 np->s_otw_call_count = 0;
3925 cv_init(&np->wait_cb_null, NULL, CV_DEFAULT, NULL);
3926 np->zoneid = getzoneid();
3927 np->zone_globals = nfs4_get_callback_globals();
3928 ASSERT(np->zone_globals != NULL);
3929 return (np);
3933 * Create a new nfs4_server_t structure and add it to the list.
3934 * Returns new node locked; reference must eventually be freed.
3936 static struct nfs4_server *
3937 add_new_nfs4_server(struct servinfo4 *svp, cred_t *cr)
3939 nfs4_server_t *sp;
3941 ASSERT(MUTEX_HELD(&nfs4_server_lst_lock));
3942 sp = new_nfs4_server(svp, cr);
3943 mutex_enter(&sp->s_lock);
3944 insque(sp, &nfs4_server_lst);
3945 sp->s_refcnt++; /* list gets a reference */
3946 sp->s_flags |= N4S_INSERTED;
3947 sp->clientid = 0;
3948 return (sp);
3951 int nfs4_server_t_debug = 0;
3954 #ifdef DEBUG
3955 void
3956 dumpnfs4slist(char *txt, mntinfo4_t *mi, clientid4 clientid, servinfo4_t *srv_p)
3958 int hash16(void *p, int len);
3959 nfs4_server_t *np;
3961 NFS4_DEBUG(nfs4_server_t_debug, (CE_NOTE,
3962 "dumping nfs4_server_t list in %s", txt));
3963 NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
3964 "mi 0x%p, want clientid %llx, addr %d/%04X",
3965 mi, (longlong_t)clientid, srv_p->sv_addr.len,
3966 hash16((void *)srv_p->sv_addr.buf, srv_p->sv_addr.len)));
3967 for (np = nfs4_server_lst.forw; np != &nfs4_server_lst;
3968 np = np->forw) {
3969 NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
3970 "node 0x%p, clientid %llx, addr %d/%04X, cnt %d",
3971 np, (longlong_t)np->clientid, np->saddr.len,
3972 hash16((void *)np->saddr.buf, np->saddr.len),
3973 np->state_ref_count));
3974 if (np->saddr.len == srv_p->sv_addr.len &&
3975 bcmp(np->saddr.buf, srv_p->sv_addr.buf,
3976 np->saddr.len) == 0)
3977 NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
3978 " - address matches"));
3979 if (np->clientid == clientid || np->clientid == 0)
3980 NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
3981 " - clientid matches"));
3982 if (np->s_thread_exit != NFS4_THREAD_EXIT)
3983 NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
3984 " - thread not exiting"));
3986 delay(hz);
3988 #endif
3992 * Move a mntinfo4_t from one server list to another.
3993 * Locking of the two nfs4_server_t nodes will be done in list order.
3995 * Returns NULL if the current nfs4_server_t for the filesystem could not
3996 * be found (e.g., due to forced unmount). Otherwise returns a reference
3997 * to the new nfs4_server_t, which must eventually be freed.
3999 nfs4_server_t *
4000 nfs4_move_mi(mntinfo4_t *mi, servinfo4_t *old, servinfo4_t *new)
4002 nfs4_server_t *p, *op = NULL, *np = NULL;
4003 int num_open;
4004 zoneid_t zoneid = nfs_zoneid();
4006 ASSERT(nfs_zone() == mi->mi_zone);
4008 mutex_enter(&nfs4_server_lst_lock);
4009 #ifdef DEBUG
4010 if (nfs4_server_t_debug)
4011 dumpnfs4slist("nfs4_move_mi", mi, (clientid4)0, new);
4012 #endif
4013 for (p = nfs4_server_lst.forw; p != &nfs4_server_lst; p = p->forw) {
4014 if (p->zoneid != zoneid)
4015 continue;
4016 if (p->saddr.len == old->sv_addr.len &&
4017 bcmp(p->saddr.buf, old->sv_addr.buf, p->saddr.len) == 0 &&
4018 p->s_thread_exit != NFS4_THREAD_EXIT) {
4019 op = p;
4020 mutex_enter(&op->s_lock);
4021 op->s_refcnt++;
4023 if (p->saddr.len == new->sv_addr.len &&
4024 bcmp(p->saddr.buf, new->sv_addr.buf, p->saddr.len) == 0 &&
4025 p->s_thread_exit != NFS4_THREAD_EXIT) {
4026 np = p;
4027 mutex_enter(&np->s_lock);
4029 if (op != NULL && np != NULL)
4030 break;
4032 if (op == NULL) {
4034 * Filesystem has been forcibly unmounted. Bail out.
4036 if (np != NULL)
4037 mutex_exit(&np->s_lock);
4038 mutex_exit(&nfs4_server_lst_lock);
4039 return (NULL);
4041 if (np != NULL) {
4042 np->s_refcnt++;
4043 } else {
4044 #ifdef DEBUG
4045 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
4046 "nfs4_move_mi: no target nfs4_server, will create."));
4047 #endif
4048 np = add_new_nfs4_server(new, kcred);
4050 mutex_exit(&nfs4_server_lst_lock);
4052 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
4053 "nfs4_move_mi: for mi 0x%p, "
4054 "old servinfo4 0x%p, new servinfo4 0x%p, "
4055 "old nfs4_server 0x%p, new nfs4_server 0x%p, ",
4056 (void*)mi, (void*)old, (void*)new,
4057 (void*)op, (void*)np));
4058 ASSERT(op != NULL && np != NULL);
4060 /* discard any delegations */
4061 nfs4_deleg_discard(mi, op);
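/*
 * Transfer this filesystem's open-file count from the old server's
 * state_ref_count to the new server's.
 */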
4063 num_open = mi->mi_open_files;
4064 mi->mi_open_files = 0;
4065 op->state_ref_count -= num_open;
4066 ASSERT(op->state_ref_count >= 0);
4067 np->state_ref_count += num_open;
4068 nfs4_remove_mi_from_server_nolock(mi, op);
4069 mi->mi_open_files = num_open;
4070 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
4071 "nfs4_move_mi: mi_open_files %d, op->cnt %d, np->cnt %d",
4072 mi->mi_open_files, op->state_ref_count, np->state_ref_count));
4074 nfs4_add_mi_to_server(np, mi);
4076 mutex_exit(&op->s_lock);
4077 mutex_exit(&np->s_lock);
4078 nfs4_server_rele(op);
4080 return (np);
4084 * Need to have the nfs4_server_lst_lock.
4085 * Search the nfs4_server list to find a match on this servinfo4
4086 * based on its address.
4088 * Returns NULL if no match is found. Otherwise returns a reference (which
4089 * must eventually be freed) to a locked nfs4_server.
4091 nfs4_server_t *
4092 servinfo4_to_nfs4_server(servinfo4_t *srv_p)
4094 nfs4_server_t *np;
4095 zoneid_t zoneid = nfs_zoneid();
4097 ASSERT(MUTEX_HELD(&nfs4_server_lst_lock));
4098 for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
4099 if (np->zoneid == zoneid &&
4100 np->saddr.len == srv_p->sv_addr.len &&
4101 bcmp(np->saddr.buf, srv_p->sv_addr.buf,
4102 np->saddr.len) == 0 &&
4103 np->s_thread_exit != NFS4_THREAD_EXIT) {
4104 mutex_enter(&np->s_lock);
4105 np->s_refcnt++;
4106 return (np);
4109 return (NULL);

/*
 * Locks the nfs4_server down if it is found and returns a reference that
 * must eventually be freed.
 */
static nfs4_server_t *
lookup_nfs4_server(nfs4_server_t *sp, int any_state)
{
	nfs4_server_t *np;

	mutex_enter(&nfs4_server_lst_lock);
	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
		mutex_enter(&np->s_lock);
		if (np == sp && np->s_refcnt > 0 &&
		    (np->s_thread_exit != NFS4_THREAD_EXIT || any_state)) {
			mutex_exit(&nfs4_server_lst_lock);
			np->s_refcnt++;
			return (np);
		}
		mutex_exit(&np->s_lock);
	}
	mutex_exit(&nfs4_server_lst_lock);

	return (NULL);
}
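
/*
 * Note on lookup_nfs4_server(): the walk only succeeds if sp is still
 * present on nfs4_server_lst with a non-zero reference count (and, unless
 * any_state is set, has not been flagged for exit by the renew thread).
 * On success sp is returned with sp->s_lock held and an additional
 * reference, which the caller must release with mutex_exit() and
 * nfs4_server_rele().
 */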

/*
 * The caller should be holding mi->mi_recovlock, and it should continue to
 * hold the lock until done with the returned nfs4_server_t. Once
 * mi->mi_recovlock is released, there is no guarantee that the returned
 * nfs4_server_t will continue to correspond to mi.
 */
nfs4_server_t *
find_nfs4_server(mntinfo4_t *mi)
{
	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

	return (lookup_nfs4_server(mi->mi_srv, 0));
}
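
/*
 * Illustrative use (hypothetical caller, following the contract described
 * above and the lock/refcount conventions of lookup_nfs4_server()):
 *
 *	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, FALSE);
 *	sp = find_nfs4_server(mi);
 *	if (sp != NULL) {
 *		... use sp while holding sp->s_lock ...
 *		mutex_exit(&sp->s_lock);
 *		nfs4_server_rele(sp);
 *	}
 *	nfs_rw_exit(&mi->mi_recovlock);
 */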

/*
 * Same as above, but takes an "any_state" parameter which can be
 * set to 1 if the caller wishes to find nfs4_server_t's which
 * have been marked for termination by the exit of the renew
 * thread. This should only be used by operations which are
 * cleaning up and will not cause an OTW op.
 */
nfs4_server_t *
find_nfs4_server_all(mntinfo4_t *mi, int any_state)
{
	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

	return (lookup_nfs4_server(mi->mi_srv, any_state));
}

/*
 * Lock sp, but only if it's still active (in the list and hasn't been
 * flagged as exiting) or 'any_state' is non-zero.
 * Returns TRUE if sp got locked and adds a reference to sp.
 */
bool_t
nfs4_server_vlock(nfs4_server_t *sp, int any_state)
{
	return (lookup_nfs4_server(sp, any_state) != NULL);
}
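
/*
 * When nfs4_server_vlock() returns TRUE the caller owns both sp->s_lock
 * and an extra reference on sp (both taken in lookup_nfs4_server()), and
 * must drop them with mutex_exit(&sp->s_lock) and nfs4_server_rele(sp).
 */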

/*
 * Release the reference to sp and destroy it if that's the last one.
 */

void
nfs4_server_rele(nfs4_server_t *sp)
{
	mutex_enter(&sp->s_lock);
	ASSERT(sp->s_refcnt > 0);
	sp->s_refcnt--;
	if (sp->s_refcnt > 0) {
		mutex_exit(&sp->s_lock);
		return;
	}
	mutex_exit(&sp->s_lock);

	mutex_enter(&nfs4_server_lst_lock);
	mutex_enter(&sp->s_lock);
	if (sp->s_refcnt > 0) {
		mutex_exit(&sp->s_lock);
		mutex_exit(&nfs4_server_lst_lock);
		return;
	}
	remque(sp);
	sp->forw = sp->back = NULL;
	mutex_exit(&nfs4_server_lst_lock);
	destroy_nfs4_server(sp);
}
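
/*
 * The second refcount check above, made after reacquiring both
 * nfs4_server_lst_lock and sp->s_lock, guards against a racing lookup
 * (e.g. lookup_nfs4_server()) that found sp on the list and bumped
 * s_refcnt between the first mutex_exit() and the list lock acquisition;
 * sp is only unlinked and destroyed once no such reference exists.
 */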

static void
destroy_nfs4_server(nfs4_server_t *sp)
{
	ASSERT(MUTEX_HELD(&sp->s_lock));
	ASSERT(sp->s_refcnt == 0);
	ASSERT(sp->s_otw_call_count == 0);

	remove_all_mi(sp);

	crfree(sp->s_cred);
	kmem_free(sp->saddr.buf, sp->saddr.maxlen);
	kmem_free(sp->clidtosend.id_val, sp->clidtosend.id_len);
	mutex_exit(&sp->s_lock);

	/* destroy the nfs4_server */
	nfs4callback_destroy(sp);
	list_destroy(&sp->s_deleg_list);
	mutex_destroy(&sp->s_lock);
	cv_destroy(&sp->cv_thread_exit);
	cv_destroy(&sp->s_cv_otw_count);
	cv_destroy(&sp->s_clientid_pend);
	cv_destroy(&sp->wait_cb_null);
	nfs_rw_destroy(&sp->s_recovlock);
	kmem_free(sp, sizeof (*sp));
}
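
/*
 * Within this file destroy_nfs4_server() is reached only from
 * nfs4_server_rele(), after the entry has been unlinked from
 * nfs4_server_lst, so no new lookups can find sp; that is why it is safe
 * to drop s_lock midway and tear the structure down without further
 * synchronization.
 */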

/*
 * Fork off a thread to free the data structures for a mount.
 */

static void
async_free_mount(vfs_t *vfsp, int flag, cred_t *cr)
{
	freemountargs_t *args;
	args = kmem_alloc(sizeof (freemountargs_t), KM_SLEEP);
	args->fm_vfsp = vfsp;
	VFS_HOLD(vfsp);
	MI4_HOLD(VFTOMI4(vfsp));
	args->fm_flag = flag;
	args->fm_cr = cr;
	crhold(cr);
	(void) zthread_create(NULL, 0, nfs4_free_mount_thread, args, 0,
	    minclsyspri);
}
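
/*
 * The VFS_HOLD(), MI4_HOLD() and crhold() references taken in
 * async_free_mount() keep the vfs, mntinfo4 and credential alive until
 * the worker thread finishes; the matching VFS_RELE(), MI4_RELE() and
 * crfree() calls are made in nfs4_free_mount_thread() below.
 */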

static void
nfs4_free_mount_thread(freemountargs_t *args)
{
	mntinfo4_t *mi;
	nfs4_free_mount(args->fm_vfsp, args->fm_flag, args->fm_cr);
	mi = VFTOMI4(args->fm_vfsp);
	crfree(args->fm_cr);
	VFS_RELE(args->fm_vfsp);
	MI4_RELE(mi);
	kmem_free(args, sizeof (freemountargs_t));
	zthread_exit();
	/* NOTREACHED */
}

/*
 * Thread to free the data structures for a given filesystem.
 */
static void
nfs4_free_mount(vfs_t *vfsp, int flag, cred_t *cr)
{
	mntinfo4_t *mi = VFTOMI4(vfsp);
	nfs4_server_t *sp;
	callb_cpr_t cpr_info;
	kmutex_t cpr_lock;
	boolean_t async_thread;
	int removed;

	bool_t must_unlock;
	nfs4_ephemeral_tree_t *eph_tree;

	/*
	 * We need to participate in the CPR framework if this is a kernel
	 * thread.
	 */
	async_thread = (curproc == nfs_zone()->zone_zsched);
	if (async_thread) {
		mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
		CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr,
		    "nfsv4AsyncUnmount");
	}

	/*
	 * We need to wait for all outstanding OTW calls
	 * and recovery to finish before we remove the mi
	 * from the nfs4_server_t, as current pending
	 * calls might still need this linkage (in order
	 * to find a nfs4_server_t from a mntinfo4_t).
	 */
	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, FALSE);
	sp = find_nfs4_server(mi);
	nfs_rw_exit(&mi->mi_recovlock);

	if (sp) {
		while (sp->s_otw_call_count != 0) {
			if (async_thread) {
				mutex_enter(&cpr_lock);
				CALLB_CPR_SAFE_BEGIN(&cpr_info);
				mutex_exit(&cpr_lock);
			}
			cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
			if (async_thread) {
				mutex_enter(&cpr_lock);
				CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
				mutex_exit(&cpr_lock);
			}
		}
		mutex_exit(&sp->s_lock);
		nfs4_server_rele(sp);
		sp = NULL;
	}

	mutex_enter(&mi->mi_lock);
	while (mi->mi_in_recovery != 0) {
		if (async_thread) {
			mutex_enter(&cpr_lock);
			CALLB_CPR_SAFE_BEGIN(&cpr_info);
			mutex_exit(&cpr_lock);
		}
		cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
		if (async_thread) {
			mutex_enter(&cpr_lock);
			CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
			mutex_exit(&cpr_lock);
		}
	}
	mutex_exit(&mi->mi_lock);

	/*
	 * If we got an error, then do not nuke the
	 * tree. Either the harvester is busy reclaiming
	 * this node or we ran into some busy condition.
	 *
	 * The harvester will eventually come along and cleanup.
	 * The only problem would be the root mount point.
	 *
	 * Since the busy node can occur for a variety
	 * of reasons and can result in an entry staying
	 * in df output but no longer accessible from the
	 * directory tree, we are okay.
	 */
	if (!nfs4_ephemeral_umount(mi, flag, cr,
	    &must_unlock, &eph_tree))
		nfs4_ephemeral_umount_activate(mi, &must_unlock,
		    &eph_tree);

	/*
	 * The original purge of the dnlc via 'dounmount'
	 * doesn't guarantee that another dnlc entry was not
	 * added while we waited for all outstanding OTW
	 * and recovery calls to finish. So re-purge the
	 * dnlc now.
	 */
	(void) dnlc_purge_vfsp(vfsp, 0);

	/*
	 * We need to explicitly stop the manager thread; the async worker
	 * threads can timeout and exit on their own.
	 */
	mutex_enter(&mi->mi_async_lock);
	mi->mi_max_threads = 0;
	NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
	mutex_exit(&mi->mi_async_lock);
	if (mi->mi_manager_thread)
		nfs4_async_manager_stop(vfsp);

	destroy_rtable4(vfsp, cr);

	nfs4_remove_mi_from_server(mi, NULL);

	if (async_thread) {
		mutex_enter(&cpr_lock);
		CALLB_CPR_EXIT(&cpr_info);	/* drops cpr_lock */
		mutex_destroy(&cpr_lock);
	}

	removed = nfs4_mi_zonelist_remove(mi);
	if (removed)
		zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
}
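
/*
 * Teardown order in nfs4_free_mount() matters: outstanding OTW calls and
 * recovery are drained first, then the ephemeral mount tree and the dnlc
 * are cleaned up, the async manager/worker threads are stopped, the rnode
 * table is destroyed, and only then is the mntinfo4 unlinked from its
 * nfs4_server_t and from the zone's mount list.
 */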

/* Referral related sub-routines */

/* Free up the contents of a knetconfig */
static void
free_knconf_contents(struct knetconfig *k)
{
	if (k == NULL)
		return;
	if (k->knc_protofmly)
		kmem_free(k->knc_protofmly, KNC_STRSIZE);
	if (k->knc_proto)
		kmem_free(k->knc_proto, KNC_STRSIZE);
}
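
/*
 * Note: free_knconf_contents() frees only the protocol family and protocol
 * strings; the struct knetconfig itself is left for the caller to free.
 */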

/*
 * This returns the exact name component from the path which gave us an
 * NFS4ERR_MOVED error.
 * If the path is /rp/aaa/bbb and the nth value is 1, aaa is returned.
 */
static char *
extract_referral_point(const char *svp, int nth)
{
	int num_slashes = 0;
	const char *p;
	char *newpath = NULL;
	int i = 0;

	newpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
	for (p = svp; *p; p++) {
		if (*p == '/')
			num_slashes++;
		if (num_slashes == nth + 1) {
			p++;
			while (*p != '/') {
				if (*p == '\0')
					break;
				newpath[i] = *p;
				i++;
				p++;
			}
			newpath[i++] = '\0';
			break;
		}
	}
	return (newpath);
}
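
/*
 * Worked example: extract_referral_point("/rp/aaa/bbb", 1) copies "aaa"
 * into a freshly kmem_zalloc'd MAXPATHLEN buffer and returns it; the
 * caller is responsible for kmem_free(buf, MAXPATHLEN) when finished.
 */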

/*
 * This sets up a new path in sv_path to do a lookup of the referral point.
 * If the path is /rp/aaa/bbb and the referral point is aaa,
 * this updates sv_path to /rp/aaa. This path will be used to get the
 * referral location.
 */
static void
setup_newsvpath(servinfo4_t *svp, int nth)
{
	int num_slashes = 0, pathlen, i = 0;
	char *newpath, *p;

	newpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
	for (p = svp->sv_path; *p; p++) {
		newpath[i] = *p;
		if (*p == '/')
			num_slashes++;
		if (num_slashes == nth + 1) {
			newpath[i] = '\0';
			pathlen = strlen(newpath) + 1;
			kmem_free(svp->sv_path, svp->sv_pathlen);
			svp->sv_path = kmem_alloc(pathlen, KM_SLEEP);
			svp->sv_pathlen = pathlen;
			bcopy(newpath, svp->sv_path, pathlen);
			break;
		}
		i++;
	}
	kmem_free(newpath, MAXPATHLEN);
}
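
/*
 * Mechanically, setup_newsvpath() copies sv_path into a scratch buffer
 * until it reaches the (nth + 1)th '/', truncates the copy there, frees
 * the old sv_path, and installs the shortened string with an updated
 * sv_pathlen; the scratch MAXPATHLEN buffer is always freed on return.
 */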