kernel/fs/nfs/nfs4_vfsops.c
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
29 * All Rights Reserved
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/vfs.h>
37 #include <sys/vfs_opreg.h>
38 #include <sys/vnode.h>
39 #include <sys/pathname.h>
40 #include <sys/sysmacros.h>
41 #include <sys/kmem.h>
42 #include <sys/mkdev.h>
43 #include <sys/mount.h>
44 #include <sys/statvfs.h>
45 #include <sys/errno.h>
46 #include <sys/debug.h>
47 #include <sys/cmn_err.h>
48 #include <sys/utsname.h>
49 #include <sys/bootconf.h>
50 #include <sys/modctl.h>
51 #include <sys/acl.h>
52 #include <sys/flock.h>
53 #include <sys/time.h>
54 #include <sys/disp.h>
55 #include <sys/policy.h>
56 #include <sys/socket.h>
57 #include <sys/netconfig.h>
58 #include <sys/dnlc.h>
59 #include <sys/list.h>
60 #include <sys/mntent.h>
62 #include <rpc/types.h>
63 #include <rpc/auth.h>
64 #include <rpc/rpcsec_gss.h>
65 #include <rpc/clnt.h>
67 #include <nfs/nfs.h>
68 #include <nfs/nfs_clnt.h>
69 #include <nfs/mount.h>
70 #include <nfs/nfs_acl.h>
72 #include <sys/fs_subr.h>
74 #include <nfs/nfs4.h>
75 #include <nfs/rnode4.h>
76 #include <nfs/nfs4_clnt.h>
77 #include <sys/fs/autofs.h>
79 #include <sys/sdt.h>
83 * Arguments passed to the thread that frees data structures after a forced unmount.
86 typedef struct {
87 vfs_t *fm_vfsp;
88 int fm_flag;
89 cred_t *fm_cr;
90 } freemountargs_t;
92 static void async_free_mount(vfs_t *, int, cred_t *);
93 static void nfs4_free_mount(vfs_t *, int, cred_t *);
94 static void nfs4_free_mount_thread(freemountargs_t *);
95 static int nfs4_chkdup_servinfo4(servinfo4_t *, servinfo4_t *);
98 * From rpcsec module (common/rpcsec).
100 extern int sec_clnt_loadinfo(struct sec_data *, struct sec_data **, model_t);
101 extern void sec_clnt_freeinfo(struct sec_data *);
104 * The order and contents of this structure must be kept in sync with that of
105 * rfsreqcnt_v4_tmpl in nfs_stats.c
107 static char *rfsnames_v4[] = {
108 "null", "compound", "reserved", "access", "close", "commit", "create",
109 "delegpurge", "delegreturn", "getattr", "getfh", "link", "lock",
110 "lockt", "locku", "lookup", "lookupp", "nverify", "open", "openattr",
111 "open_confirm", "open_downgrade", "putfh", "putpubfh", "putrootfh",
112 "read", "readdir", "readlink", "remove", "rename", "renew",
113 "restorefh", "savefh", "secinfo", "setattr", "setclientid",
114 "setclientid_confirm", "verify", "write"
118 * nfs4_max_mount_retry is the number of times the client will redrive
119 * a mount compound before giving up and returning failure. The intent
120 * is to redrive mount compounds which fail NFS4ERR_STALE so that
121 * if a component of the server path being mounted goes stale, it can
122 * "recover" by redriving the mount compund (LOOKUP ops). This recovery
123 * code is needed outside of the recovery framework because mount is a
124 * special case. The client doesn't create vnodes/rnodes for components
125 * of the server path being mounted. The recovery code recovers real
126 * client objects, not STALE FHs which map to components of the server
127 * path being mounted.
129 * We could just fail the mount on the first time, but that would
130 * instantly trigger failover (from nfs4_mount), and the client should
131 * try to re-lookup the STALE FH before doing failover. The easiest
132 * way to "re-lookup" is to simply redrive the mount compound.
134 static int nfs4_max_mount_retry = 2;
137 * nfs4 vfs operations.
139 int nfs4_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *);
140 static int nfs4_unmount(vfs_t *, int, cred_t *);
141 static int nfs4_root(vfs_t *, vnode_t **);
142 static int nfs4_statvfs(vfs_t *, struct statvfs64 *);
143 static int nfs4_sync(vfs_t *, short, cred_t *);
144 static int nfs4_vget(vfs_t *, vnode_t **, fid_t *);
145 static int nfs4_mountroot(vfs_t *, whymountroot_t);
146 static void nfs4_freevfs(vfs_t *);
148 static int nfs4rootvp(vnode_t **, vfs_t *, struct servinfo4 *,
149 int, cred_t *, zone_t *);
151 vfsops_t *nfs4_vfsops;
153 int nfs4_vfsinit(void);
154 void nfs4_vfsfini(void);
155 static void nfs4setclientid_init(void);
156 static void nfs4setclientid_fini(void);
157 static void nfs4setclientid_otw(mntinfo4_t *, servinfo4_t *, cred_t *,
158 struct nfs4_server *, nfs4_error_t *, int *);
159 static void destroy_nfs4_server(nfs4_server_t *);
160 static void remove_mi(nfs4_server_t *, mntinfo4_t *);
162 extern void nfs4_ephemeral_init(void);
163 extern void nfs4_ephemeral_fini(void);
165 /* referral related routines */
166 static servinfo4_t *copy_svp(servinfo4_t *);
167 static void free_knconf_contents(struct knetconfig *k);
168 static char *extract_referral_point(const char *, int);
169 static void setup_newsvpath(servinfo4_t *, int);
170 static void update_servinfo4(servinfo4_t *, fs_location4 *,
171 struct nfs_fsl_info *, char *, int);
174 * Initialize the vfs structure
177 static int nfs4fstyp;
181 * Debug variable to check for rdma based
182 * transport startup and cleanup. Controlled
183 * through /etc/system. Off by default.
185 extern int rdma_debug;
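/*
 * nfs4init() is called during NFS module initialization to register the
 * NFSv4 vfs operations as well as the regular and mirror-mount trigger
 * vnode operations; on any failure the partially registered ops tables
 * are torn down again before the error is returned.
 */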
188 nfs4init(int fstyp, char *name)
190 static const fs_operation_def_t nfs4_vfsops_template[] = {
191 VFSNAME_MOUNT, { .vfs_mount = nfs4_mount },
192 VFSNAME_UNMOUNT, { .vfs_unmount = nfs4_unmount },
193 VFSNAME_ROOT, { .vfs_root = nfs4_root },
194 VFSNAME_STATVFS, { .vfs_statvfs = nfs4_statvfs },
195 VFSNAME_SYNC, { .vfs_sync = nfs4_sync },
196 VFSNAME_VGET, { .vfs_vget = nfs4_vget },
197 VFSNAME_MOUNTROOT, { .vfs_mountroot = nfs4_mountroot },
198 VFSNAME_FREEVFS, { .vfs_freevfs = nfs4_freevfs },
199 NULL, NULL
201 int error;
203 nfs4_vfsops = NULL;
204 nfs4_vnodeops = NULL;
205 nfs4_trigger_vnodeops = NULL;
207 error = vfs_setfsops(fstyp, nfs4_vfsops_template, &nfs4_vfsops);
208 if (error != 0) {
209 zcmn_err(GLOBAL_ZONEID, CE_WARN,
210 "nfs4init: bad vfs ops template");
211 goto out;
214 error = vn_make_ops(name, nfs4_vnodeops_template, &nfs4_vnodeops);
215 if (error != 0) {
216 zcmn_err(GLOBAL_ZONEID, CE_WARN,
217 "nfs4init: bad vnode ops template");
218 goto out;
221 error = vn_make_ops("nfs4_trigger", nfs4_trigger_vnodeops_template,
222 &nfs4_trigger_vnodeops);
223 if (error != 0) {
224 zcmn_err(GLOBAL_ZONEID, CE_WARN,
225 "nfs4init: bad trigger vnode ops template");
226 goto out;
229 nfs4fstyp = fstyp;
230 (void) nfs4_vfsinit();
231 (void) nfs4_init_dot_entries();
233 out:
234 if (error) {
235 if (nfs4_trigger_vnodeops != NULL)
236 vn_freevnodeops(nfs4_trigger_vnodeops);
238 if (nfs4_vnodeops != NULL)
239 vn_freevnodeops(nfs4_vnodeops);
241 (void) vfs_freevfsops_by_type(fstyp);
244 return (error);
247 void
248 nfs4fini(void)
250 (void) nfs4_destroy_dot_entries();
251 nfs4_vfsfini();
255 * Create a new sec_data structure to store AUTH_DH related data:
256 * netname, syncaddr, knetconfig. There is no AUTH_F_RPCTIMESYNC
257 * flag set for NFS V4 since we avoid contacting the rpcbind
258 * daemon and instead use the IP time service (IPPORT_TIMESERVER).
260 * sec_data can be freed by sec_clnt_freeinfo().
262 static struct sec_data *
263 create_authdh_data(char *netname, int nlen, struct netbuf *syncaddr,
264 struct knetconfig *knconf) {
265 struct sec_data *secdata;
266 dh_k4_clntdata_t *data;
267 char *pf, *p;
269 if (syncaddr == NULL || syncaddr->buf == NULL || nlen == 0)
270 return (NULL);
272 secdata = kmem_alloc(sizeof (*secdata), KM_SLEEP);
273 secdata->flags = 0;
275 data = kmem_alloc(sizeof (*data), KM_SLEEP);
277 data->syncaddr.maxlen = syncaddr->maxlen;
278 data->syncaddr.len = syncaddr->len;
279 data->syncaddr.buf = kmem_alloc(syncaddr->len, KM_SLEEP);
280 bcopy(syncaddr->buf, data->syncaddr.buf, syncaddr->len);
283 * duplicate the knconf information for the
284 * new opaque data.
286 data->knconf = kmem_alloc(sizeof (*knconf), KM_SLEEP);
287 *data->knconf = *knconf;
288 pf = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
289 p = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
290 bcopy(knconf->knc_protofmly, pf, KNC_STRSIZE);
291 bcopy(knconf->knc_proto, p, KNC_STRSIZE);
292 data->knconf->knc_protofmly = pf;
293 data->knconf->knc_proto = p;
295 /* move server netname to the sec_data structure */
296 data->netname = kmem_alloc(nlen, KM_SLEEP);
297 bcopy(netname, data->netname, nlen);
298 data->netnamelen = (int)nlen;
300 secdata->secmod = AUTH_DH;
301 secdata->rpcflavor = AUTH_DH;
302 secdata->data = (caddr_t)data;
304 return (secdata);
308 * Returns (deep) copy of sec_data_t. Allocates all memory required; caller
309 * is responsible for freeing.
311 sec_data_t *
312 copy_sec_data(sec_data_t *fsecdata) {
313 sec_data_t *tsecdata;
315 if (fsecdata == NULL)
316 return (NULL);
318 if (fsecdata->rpcflavor == AUTH_DH) {
319 dh_k4_clntdata_t *fdata = (dh_k4_clntdata_t *)fsecdata->data;
321 if (fdata == NULL)
322 return (NULL);
324 tsecdata = (sec_data_t *)create_authdh_data(fdata->netname,
325 fdata->netnamelen, &fdata->syncaddr, fdata->knconf);
327 return (tsecdata);
330 tsecdata = kmem_zalloc(sizeof (sec_data_t), KM_SLEEP);
332 tsecdata->secmod = fsecdata->secmod;
333 tsecdata->rpcflavor = fsecdata->rpcflavor;
334 tsecdata->flags = fsecdata->flags;
335 tsecdata->uid = fsecdata->uid;
337 if (fsecdata->rpcflavor == RPCSEC_GSS) {
338 gss_clntdata_t *gcd = (gss_clntdata_t *)fsecdata->data;
340 tsecdata->data = (caddr_t)copy_sec_data_gss(gcd);
341 } else {
342 tsecdata->data = NULL;
345 return (tsecdata);
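/*
 * Deep-copy the RPCSEC_GSS client data: the mechanism OID, service,
 * principal triple (uname/inst/realm) and QOP. The caller is
 * responsible for freeing the returned structure.
 */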
348 gss_clntdata_t *
349 copy_sec_data_gss(gss_clntdata_t *fdata)
351 gss_clntdata_t *tdata;
353 if (fdata == NULL)
354 return (NULL);
356 tdata = kmem_zalloc(sizeof (gss_clntdata_t), KM_SLEEP);
358 tdata->mechanism.length = fdata->mechanism.length;
359 tdata->mechanism.elements = kmem_zalloc(fdata->mechanism.length,
360 KM_SLEEP);
361 bcopy(fdata->mechanism.elements, tdata->mechanism.elements,
362 fdata->mechanism.length);
364 tdata->service = fdata->service;
366 (void) strcpy(tdata->uname, fdata->uname);
367 (void) strcpy(tdata->inst, fdata->inst);
368 (void) strcpy(tdata->realm, fdata->realm);
370 tdata->qop = fdata->qop;
372 return (tdata);
375 static int
376 nfs4_chkdup_servinfo4(servinfo4_t *svp_head, servinfo4_t *svp)
378 servinfo4_t *si;
381 * Iterate over the servinfo4 list to make sure
382 * we do not have a duplicate. Skip any servinfo4
383 * that has been marked "NOT IN USE"
385 for (si = svp_head; si; si = si->sv_next) {
386 (void) nfs_rw_enter_sig(&si->sv_lock, RW_READER, 0);
387 if (si->sv_flags & SV4_NOTINUSE) {
388 nfs_rw_exit(&si->sv_lock);
389 continue;
391 nfs_rw_exit(&si->sv_lock);
392 if (si == svp)
393 continue;
394 if (si->sv_addr.len == svp->sv_addr.len &&
395 strcmp(si->sv_knconf->knc_protofmly,
396 svp->sv_knconf->knc_protofmly) == 0 &&
397 bcmp(si->sv_addr.buf, svp->sv_addr.buf,
398 si->sv_addr.len) == 0) {
399 /* it's a duplicate */
400 return (1);
403 /* it's not a duplicate */
404 return (0);
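/*
 * Free every dynamically allocated member of an nfs_args structure
 * (knconf, fh, hostname, addr, syncaddr, netname and the security
 * extension data), NULLing each pointer as it is released.
 */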
407 void
408 nfs4_free_args(struct nfs_args *nargs)
410 if (nargs->knconf) {
411 if (nargs->knconf->knc_protofmly)
412 kmem_free(nargs->knconf->knc_protofmly,
413 KNC_STRSIZE);
414 if (nargs->knconf->knc_proto)
415 kmem_free(nargs->knconf->knc_proto, KNC_STRSIZE);
416 kmem_free(nargs->knconf, sizeof (*nargs->knconf));
417 nargs->knconf = NULL;
420 if (nargs->fh) {
421 kmem_free(nargs->fh, strlen(nargs->fh) + 1);
422 nargs->fh = NULL;
425 if (nargs->hostname) {
426 kmem_free(nargs->hostname, strlen(nargs->hostname) + 1);
427 nargs->hostname = NULL;
430 if (nargs->addr) {
431 if (nargs->addr->buf) {
432 ASSERT(nargs->addr->len);
433 kmem_free(nargs->addr->buf, nargs->addr->len);
435 kmem_free(nargs->addr, sizeof (struct netbuf));
436 nargs->addr = NULL;
439 if (nargs->syncaddr) {
440 ASSERT(nargs->syncaddr->len);
441 if (nargs->syncaddr->buf) {
442 ASSERT(nargs->syncaddr->len);
443 kmem_free(nargs->syncaddr->buf, nargs->syncaddr->len);
445 kmem_free(nargs->syncaddr, sizeof (struct netbuf));
446 nargs->syncaddr = NULL;
449 if (nargs->netname) {
450 kmem_free(nargs->netname, strlen(nargs->netname) + 1);
451 nargs->netname = NULL;
454 if (nargs->nfs_ext_u.nfs_extA.secdata) {
455 sec_clnt_freeinfo(
456 nargs->nfs_ext_u.nfs_extA.secdata);
457 nargs->nfs_ext_u.nfs_extA.secdata = NULL;
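/*
 * Copy mount arguments in from user space, turning the embedded
 * pointers (knetconfig, server address, root fh path, hostname,
 * syncaddr/netname and security data) into kernel-allocated copies.
 * On error, everything allocated so far is freed via nfs4_free_args().
 */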
463 nfs4_copyin(char *data, int datalen, struct nfs_args *nargs)
466 int error;
467 size_t hlen; /* length of hostname */
468 size_t nlen; /* length of netname */
469 char netname[MAXNETNAMELEN+1]; /* server's netname */
470 struct netbuf addr; /* server's address */
471 struct netbuf syncaddr; /* AUTH_DES time sync addr */
472 struct knetconfig *knconf; /* transport structure */
473 struct sec_data *secdata = NULL; /* security data */
474 STRUCT_DECL(nfs_args, args); /* nfs mount arguments */
475 STRUCT_DECL(knetconfig, knconf_tmp);
476 STRUCT_DECL(netbuf, addr_tmp);
477 int flags;
478 char *p, *pf;
479 struct pathname pn;
480 char *userbufptr;
483 bzero(nargs, sizeof (*nargs));
485 STRUCT_INIT(args, get_udatamodel());
486 bzero(STRUCT_BUF(args), SIZEOF_STRUCT(nfs_args, DATAMODEL_NATIVE));
487 if (copyin(data, STRUCT_BUF(args), MIN(datalen,
488 STRUCT_SIZE(args))))
489 return (EFAULT);
491 nargs->wsize = STRUCT_FGET(args, wsize);
492 nargs->rsize = STRUCT_FGET(args, rsize);
493 nargs->timeo = STRUCT_FGET(args, timeo);
494 nargs->retrans = STRUCT_FGET(args, retrans);
495 nargs->acregmin = STRUCT_FGET(args, acregmin);
496 nargs->acregmax = STRUCT_FGET(args, acregmax);
497 nargs->acdirmin = STRUCT_FGET(args, acdirmin);
498 nargs->acdirmax = STRUCT_FGET(args, acdirmax);
500 flags = STRUCT_FGET(args, flags);
501 nargs->flags = flags;
503 addr.buf = NULL;
504 syncaddr.buf = NULL;
508 * Allocate space for a knetconfig structure and
509 * its strings and copy in from user-land.
511 knconf = kmem_zalloc(sizeof (*knconf), KM_SLEEP);
512 STRUCT_INIT(knconf_tmp, get_udatamodel());
513 if (copyin(STRUCT_FGETP(args, knconf), STRUCT_BUF(knconf_tmp),
514 STRUCT_SIZE(knconf_tmp))) {
515 kmem_free(knconf, sizeof (*knconf));
516 return (EFAULT);
519 knconf->knc_semantics = STRUCT_FGET(knconf_tmp, knc_semantics);
520 knconf->knc_protofmly = STRUCT_FGETP(knconf_tmp, knc_protofmly);
521 knconf->knc_proto = STRUCT_FGETP(knconf_tmp, knc_proto);
522 if (get_udatamodel() != DATAMODEL_LP64) {
523 knconf->knc_rdev = expldev(STRUCT_FGET(knconf_tmp, knc_rdev));
524 } else {
525 knconf->knc_rdev = STRUCT_FGET(knconf_tmp, knc_rdev);
528 pf = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
529 p = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
530 error = copyinstr(knconf->knc_protofmly, pf, KNC_STRSIZE, NULL);
531 if (error) {
532 kmem_free(pf, KNC_STRSIZE);
533 kmem_free(p, KNC_STRSIZE);
534 kmem_free(knconf, sizeof (*knconf));
535 return (error);
538 error = copyinstr(knconf->knc_proto, p, KNC_STRSIZE, NULL);
539 if (error) {
540 kmem_free(pf, KNC_STRSIZE);
541 kmem_free(p, KNC_STRSIZE);
542 kmem_free(knconf, sizeof (*knconf));
543 return (error);
547 knconf->knc_protofmly = pf;
548 knconf->knc_proto = p;
550 nargs->knconf = knconf;
553 * Get server address
555 STRUCT_INIT(addr_tmp, get_udatamodel());
556 if (copyin(STRUCT_FGETP(args, addr), STRUCT_BUF(addr_tmp),
557 STRUCT_SIZE(addr_tmp))) {
558 error = EFAULT;
559 goto errout;
562 nargs->addr = kmem_zalloc(sizeof (struct netbuf), KM_SLEEP);
563 userbufptr = STRUCT_FGETP(addr_tmp, buf);
564 addr.len = STRUCT_FGET(addr_tmp, len);
565 addr.buf = kmem_alloc(addr.len, KM_SLEEP);
566 addr.maxlen = addr.len;
567 if (copyin(userbufptr, addr.buf, addr.len)) {
568 kmem_free(addr.buf, addr.len);
569 error = EFAULT;
570 goto errout;
572 bcopy(&addr, nargs->addr, sizeof (struct netbuf));
575 * Get the root fhandle
577 error = pn_get(STRUCT_FGETP(args, fh), UIO_USERSPACE, &pn);
578 if (error)
579 goto errout;
581 /* Volatile fh: keep server paths, so use actual-size strings */
582 nargs->fh = kmem_alloc(pn.pn_pathlen + 1, KM_SLEEP);
583 bcopy(pn.pn_path, nargs->fh, pn.pn_pathlen);
584 nargs->fh[pn.pn_pathlen] = '\0';
585 pn_free(&pn);
589 * Get server's hostname
591 if (flags & NFSMNT_HOSTNAME) {
592 error = copyinstr(STRUCT_FGETP(args, hostname),
593 netname, sizeof (netname), &hlen);
594 if (error)
595 goto errout;
596 nargs->hostname = kmem_zalloc(hlen, KM_SLEEP);
597 (void) strcpy(nargs->hostname, netname);
599 } else {
600 nargs->hostname = NULL;
605 * If there are syncaddr and netname data, load them in. This is
606 * to support data needed for NFSV4 when AUTH_DH is the negotiated
607 * flavor via SECINFO (instead of using the MOUNT protocol as in V3).
609 netname[0] = '\0';
610 if (flags & NFSMNT_SECURE) {
612 /* get syncaddr */
613 STRUCT_INIT(addr_tmp, get_udatamodel());
614 if (copyin(STRUCT_FGETP(args, syncaddr), STRUCT_BUF(addr_tmp),
615 STRUCT_SIZE(addr_tmp))) {
616 error = EINVAL;
617 goto errout;
619 userbufptr = STRUCT_FGETP(addr_tmp, buf);
620 syncaddr.len = STRUCT_FGET(addr_tmp, len);
621 syncaddr.buf = kmem_alloc(syncaddr.len, KM_SLEEP);
622 syncaddr.maxlen = syncaddr.len;
623 if (copyin(userbufptr, syncaddr.buf, syncaddr.len)) {
624 kmem_free(syncaddr.buf, syncaddr.len);
625 error = EFAULT;
626 goto errout;
629 nargs->syncaddr = kmem_alloc(sizeof (struct netbuf), KM_SLEEP);
630 bcopy(&syncaddr, nargs->syncaddr, sizeof (struct netbuf));
632 /* get server's netname */
633 if (copyinstr(STRUCT_FGETP(args, netname), netname,
634 sizeof (netname), &nlen)) {
635 error = EFAULT;
636 goto errout;
639 netname[nlen] = '\0';
640 nargs->netname = kmem_zalloc(nlen, KM_SLEEP);
641 (void) strcpy(nargs->netname, netname);
645 * Get the extension data which has the security data structure.
646 * This includes data for AUTH_SYS as well.
648 if (flags & NFSMNT_NEWARGS) {
649 nargs->nfs_args_ext = STRUCT_FGET(args, nfs_args_ext);
650 if (nargs->nfs_args_ext == NFS_ARGS_EXTA ||
651 nargs->nfs_args_ext == NFS_ARGS_EXTB) {
653 * Indicating the application is using the new
654 * sec_data structure to pass in the security
655 * data.
657 if (STRUCT_FGETP(args,
658 nfs_ext_u.nfs_extA.secdata) != NULL) {
659 error = sec_clnt_loadinfo(
660 (struct sec_data *)STRUCT_FGETP(args,
661 nfs_ext_u.nfs_extA.secdata),
662 &secdata, get_udatamodel());
664 nargs->nfs_ext_u.nfs_extA.secdata = secdata;
668 if (error)
669 goto errout;
672 * Failover support:
674 * We may have a linked list of nfs_args structures,
675 * which means the user is looking for failover. If
676 * the mount is not "read-only", or is "soft",
677 * we want to bail out with EINVAL.
679 if (nargs->nfs_args_ext == NFS_ARGS_EXTB)
680 nargs->nfs_ext_u.nfs_extB.next =
681 STRUCT_FGETP(args, nfs_ext_u.nfs_extB.next);
683 errout:
684 if (error)
685 nfs4_free_args(nargs);
687 return (error);
692 * nfs mount vfsop
693 * Set up mount info record and attach it to vfs struct.
696 nfs4_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
698 char *data = uap->dataptr;
699 int error;
700 vnode_t *rtvp; /* the server's root */
701 mntinfo4_t *mi; /* mount info, pointed at by vfs */
702 struct knetconfig *rdma_knconf; /* rdma transport structure */
703 rnode4_t *rp;
704 struct servinfo4 *svp; /* nfs server info */
705 struct servinfo4 *svp_tail = NULL; /* previous nfs server info */
706 struct servinfo4 *svp_head; /* first nfs server info */
707 struct servinfo4 *svp_2ndlast; /* 2nd last in server info list */
708 struct sec_data *secdata; /* security data */
709 struct nfs_args *args = NULL;
710 int flags, addr_type, removed;
711 zone_t *zone = nfs_zone();
712 nfs4_error_t n4e;
713 zone_t *mntzone = NULL;
715 if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
716 return (EPERM);
717 if (mvp->v_type != VDIR)
718 return (ENOTDIR);
721 * get arguments
723 * nfs_args is now versioned and is extensible, so
724 * uap->datalen might be different from sizeof (args)
725 * in a compatible situation.
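 *
 * The "more:" label below is re-entered once for each entry in a
 * linked list of nfs_args structures (the failover case), building
 * one servinfo4_t per server.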
727 more:
728 if (!(uap->flags & MS_SYSSPACE)) {
729 if (args == NULL)
730 args = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);
731 else
732 nfs4_free_args(args);
733 error = nfs4_copyin(data, uap->datalen, args);
734 if (error) {
735 if (args) {
736 kmem_free(args, sizeof (*args));
738 return (error);
740 } else {
741 args = (struct nfs_args *)data;
744 flags = args->flags;
747 * If the request changes the locking type, disallow the remount,
748 * because it's questionable whether we can transfer the
749 * locking state correctly.
751 if (uap->flags & MS_REMOUNT) {
752 if (!(uap->flags & MS_SYSSPACE)) {
753 nfs4_free_args(args);
754 kmem_free(args, sizeof (*args));
756 if ((mi = VFTOMI4(vfsp)) != NULL) {
757 uint_t new_mi_llock;
758 uint_t old_mi_llock;
759 new_mi_llock = (flags & NFSMNT_LLOCK) ? 1 : 0;
760 old_mi_llock = (mi->mi_flags & MI4_LLOCK) ? 1 : 0;
761 if (old_mi_llock != new_mi_llock)
762 return (EBUSY);
764 return (0);
768 * For ephemeral mount trigger stub vnodes, we have two problems
769 * to solve: racing threads will likely fail the v_count check, and
770 * we want only one to proceed with the mount.
772 * For stubs, if the mount has already occurred (via a racing thread),
773 * just return success. If not, skip the v_count check and proceed.
774 * Note that we are already serialised at this point.
776 mutex_enter(&mvp->v_lock);
777 if (vn_matchops(mvp, nfs4_trigger_vnodeops)) {
778 /* mntpt is a v4 stub vnode */
779 ASSERT(RP_ISSTUB(VTOR4(mvp)));
780 ASSERT(!(uap->flags & MS_OVERLAY));
781 ASSERT(!(mvp->v_flag & VROOT));
782 if (vn_mountedvfs(mvp) != NULL) {
783 /* ephemeral mount has already occurred */
784 ASSERT(uap->flags & MS_SYSSPACE);
785 mutex_exit(&mvp->v_lock);
786 return (0);
788 } else {
789 /* mntpt is a non-v4 or v4 non-stub vnode */
790 if (!(uap->flags & MS_OVERLAY) &&
791 (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
792 mutex_exit(&mvp->v_lock);
793 if (!(uap->flags & MS_SYSSPACE)) {
794 nfs4_free_args(args);
795 kmem_free(args, sizeof (*args));
797 return (EBUSY);
800 mutex_exit(&mvp->v_lock);
802 /* make sure things are zeroed for errout: */
803 rtvp = NULL;
804 mi = NULL;
805 secdata = NULL;
808 * A valid knetconfig structure is required.
810 if (!(flags & NFSMNT_KNCONF) ||
811 args->knconf == NULL || args->knconf->knc_protofmly == NULL ||
812 args->knconf->knc_proto == NULL ||
813 (strcmp(args->knconf->knc_proto, NC_UDP) == 0)) {
814 if (!(uap->flags & MS_SYSSPACE)) {
815 nfs4_free_args(args);
816 kmem_free(args, sizeof (*args));
818 return (EINVAL);
821 if ((strlen(args->knconf->knc_protofmly) >= KNC_STRSIZE) ||
822 (strlen(args->knconf->knc_proto) >= KNC_STRSIZE)) {
823 if (!(uap->flags & MS_SYSSPACE)) {
824 nfs4_free_args(args);
825 kmem_free(args, sizeof (*args));
827 return (EINVAL);
831 * Allocate a servinfo4 struct.
833 svp = kmem_zalloc(sizeof (*svp), KM_SLEEP);
834 nfs_rw_init(&svp->sv_lock, NULL, RW_DEFAULT, NULL);
835 if (svp_tail) {
836 svp_2ndlast = svp_tail;
837 svp_tail->sv_next = svp;
838 } else {
839 svp_head = svp;
840 svp_2ndlast = svp;
843 svp_tail = svp;
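/*
 * svp_2ndlast tracks the next-to-last list entry so that, if this
 * server later proves unusable (e.g. no RDMA path when proto=rdma
 * was requested), it can be unlinked from the failover list and
 * freed before moving on to the next nfs_args entry.
 */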
844 svp->sv_knconf = args->knconf;
845 args->knconf = NULL;
848 * Get server address
850 if (args->addr == NULL || args->addr->buf == NULL) {
851 error = EINVAL;
852 goto errout;
855 svp->sv_addr.maxlen = args->addr->maxlen;
856 svp->sv_addr.len = args->addr->len;
857 svp->sv_addr.buf = args->addr->buf;
858 args->addr->buf = NULL;
861 * Get the root fhandle
863 if (args->fh == NULL || (strlen(args->fh) >= MAXPATHLEN)) {
864 error = EINVAL;
865 goto errout;
868 svp->sv_path = args->fh;
869 svp->sv_pathlen = strlen(args->fh) + 1;
870 args->fh = NULL;
873 * Get server's hostname
875 if (flags & NFSMNT_HOSTNAME) {
876 if (args->hostname == NULL || (strlen(args->hostname) >
877 MAXNETNAMELEN)) {
878 error = EINVAL;
879 goto errout;
881 svp->sv_hostnamelen = strlen(args->hostname) + 1;
882 svp->sv_hostname = args->hostname;
883 args->hostname = NULL;
884 } else {
885 char *p = "unknown-host";
886 svp->sv_hostnamelen = strlen(p) + 1;
887 svp->sv_hostname = kmem_zalloc(svp->sv_hostnamelen, KM_SLEEP);
888 (void) strcpy(svp->sv_hostname, p);
892 * RDMA MOUNT SUPPORT FOR NFS v4.
893 * Determine whether it is possible to use RDMA; if so, overload the
894 * knconf with an RDMA-specific knconf and free the original knconf.
896 if ((flags & NFSMNT_TRYRDMA) || (flags & NFSMNT_DORDMA)) {
898 * Determine the addr type for RDMA, IPv4 or v6.
900 if (strcmp(svp->sv_knconf->knc_protofmly, NC_INET) == 0)
901 addr_type = AF_INET;
902 else if (strcmp(svp->sv_knconf->knc_protofmly, NC_INET6) == 0)
903 addr_type = AF_INET6;
905 if (rdma_reachable(addr_type, &svp->sv_addr,
906 &rdma_knconf) == 0) {
908 * If successful, hijack the original knconf and
909 * replace it with the new one, depending on the flags.
911 svp->sv_origknconf = svp->sv_knconf;
912 svp->sv_knconf = rdma_knconf;
913 } else {
914 if (flags & NFSMNT_TRYRDMA) {
915 #ifdef DEBUG
916 if (rdma_debug)
917 zcmn_err(getzoneid(), CE_WARN,
918 "no RDMA onboard, revert\n");
919 #endif
922 if (flags & NFSMNT_DORDMA) {
924 * If proto=rdma is specified and no RDMA
925 * path to this server is available, then
926 * ditch this server.
927 * This is not included in the mountable
928 * server list or the replica list.
929 * Check if more servers are specified;
930 * failover case; otherwise bail out of the mount.
932 if (args->nfs_args_ext == NFS_ARGS_EXTB &&
933 args->nfs_ext_u.nfs_extB.next != NULL) {
934 data = (char *)
935 args->nfs_ext_u.nfs_extB.next;
936 if (uap->flags & MS_RDONLY &&
937 !(flags & NFSMNT_SOFT)) {
938 if (svp_head->sv_next == NULL) {
939 svp_tail = NULL;
940 svp_2ndlast = NULL;
941 sv4_free(svp_head);
942 goto more;
943 } else {
944 svp_tail = svp_2ndlast;
945 svp_2ndlast->sv_next =
946 NULL;
947 sv4_free(svp);
948 goto more;
951 } else {
953 * This is the last server specified
954 * in the nfs_args list passed down
955 * and it is not RDMA capable.
957 if (svp_head->sv_next == NULL) {
959 * Is this the only one?
961 error = EINVAL;
962 #ifdef DEBUG
963 if (rdma_debug)
964 zcmn_err(getzoneid(),
965 CE_WARN,
966 "No RDMA srv");
967 #endif
968 goto errout;
969 } else {
971 * There is a list, since some
972 * servers specified before
973 * this one passed all requirements
975 svp_tail = svp_2ndlast;
976 svp_2ndlast->sv_next = NULL;
977 sv4_free(svp);
978 goto proceed;
986 * If there are syncaddr and netname data, load them in. This is
987 * to support data needed for NFSV4 when AUTH_DH is the negotiated
988 * flavor via SECINFO (instead of using the MOUNT protocol as in V3).
990 if (args->flags & NFSMNT_SECURE) {
991 svp->sv_dhsec = create_authdh_data(args->netname,
992 strlen(args->netname),
993 args->syncaddr, svp->sv_knconf);
997 * Get the extension data which has the security data structure.
998 * This includes data for AUTH_SYS as well.
1000 if (flags & NFSMNT_NEWARGS) {
1001 switch (args->nfs_args_ext) {
1002 case NFS_ARGS_EXTA:
1003 case NFS_ARGS_EXTB:
1005 * Indicating the application is using the new
1006 * sec_data structure to pass in the security
1007 * data.
1009 secdata = args->nfs_ext_u.nfs_extA.secdata;
1010 if (secdata == NULL) {
1011 error = EINVAL;
1012 } else if (uap->flags & MS_SYSSPACE) {
1014 * Need to validate the flavor here if
1015 * sysspace, userspace was already
1016 * validated in the nfs4_copyin function.
1018 switch (secdata->rpcflavor) {
1019 case AUTH_NONE:
1020 case AUTH_UNIX:
1021 case AUTH_LOOPBACK:
1022 case AUTH_DES:
1023 case RPCSEC_GSS:
1024 break;
1025 default:
1026 error = EINVAL;
1027 goto errout;
1030 args->nfs_ext_u.nfs_extA.secdata = NULL;
1031 break;
1033 default:
1034 error = EINVAL;
1035 break;
1038 } else if (flags & NFSMNT_SECURE) {
1040 * NFSMNT_SECURE is deprecated but we keep it
1041 * to support the rogue user-generated application
1042 * that may use this undocumented interface to do
1043 * AUTH_DH security, e.g. our own rexd.
1045 * Also note that NFSMNT_SECURE is used for passing
1046 * AUTH_DH info to be used in negotiation.
1048 secdata = create_authdh_data(args->netname,
1049 strlen(args->netname), args->syncaddr, svp->sv_knconf);
1051 } else {
1052 secdata = kmem_alloc(sizeof (*secdata), KM_SLEEP);
1053 secdata->secmod = secdata->rpcflavor = AUTH_SYS;
1054 secdata->data = NULL;
1057 svp->sv_secdata = secdata;
1060 * The user does not explicitly specify a flavor, and a
1061 * user-defined default flavor is passed down.
1063 if (flags & NFSMNT_SECDEFAULT) {
1064 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
1065 svp->sv_flags |= SV4_TRYSECDEFAULT;
1066 nfs_rw_exit(&svp->sv_lock);
1070 * Failover support:
1072 * We may have a linked list of nfs_args structures,
1073 * which means the user is looking for failover. If
1074 * the mount is not "read-only", or is "soft",
1075 * we want to bail out with EINVAL.
1077 if (args->nfs_args_ext == NFS_ARGS_EXTB &&
1078 args->nfs_ext_u.nfs_extB.next != NULL) {
1079 if (uap->flags & MS_RDONLY && !(flags & NFSMNT_SOFT)) {
1080 data = (char *)args->nfs_ext_u.nfs_extB.next;
1081 goto more;
1083 error = EINVAL;
1084 goto errout;
1088 * Determine the zone we're being mounted into.
1090 zone_hold(mntzone = zone); /* start with this assumption */
1091 if (getzoneid() == GLOBAL_ZONEID) {
1092 zone_rele(mntzone);
1093 mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
1094 ASSERT(mntzone != NULL);
1095 if (mntzone != zone) {
1096 error = EBUSY;
1097 goto errout;
1102 * Stop the mount from going any further if the zone is going away.
1104 if (zone_status_get(mntzone) >= ZONE_IS_SHUTTING_DOWN) {
1105 error = EBUSY;
1106 goto errout;
1110 * Get root vnode.
1112 proceed:
1113 error = nfs4rootvp(&rtvp, vfsp, svp_head, flags, cr, mntzone);
1114 if (error) {
1115 /* if nfs4rootvp failed, it will free svp_head */
1116 svp_head = NULL;
1117 goto errout;
1120 mi = VTOMI4(rtvp);
1123 * Send client id to the server, if necessary
1125 nfs4_error_zinit(&n4e);
1126 nfs4setclientid(mi, cr, FALSE, &n4e);
1128 error = n4e.error;
1130 if (error)
1131 goto errout;
1134 * Set option fields in the mount info record
1137 if (svp_head->sv_next) {
1138 mutex_enter(&mi->mi_lock);
1139 mi->mi_flags |= MI4_LLOCK;
1140 mutex_exit(&mi->mi_lock);
1142 error = nfs4_setopts(rtvp, DATAMODEL_NATIVE, args);
1143 if (error)
1144 goto errout;
1147 * Time to tie in the mirror mount info at last!
1149 if (flags & NFSMNT_EPHEMERAL)
1150 error = nfs4_record_ephemeral_mount(mi, mvp);
1152 errout:
1153 if (error) {
1154 if (rtvp != NULL) {
1155 rp = VTOR4(rtvp);
1156 if (rp->r_flags & R4HASHED)
1157 rp4_rmhash(rp);
1159 if (mi != NULL) {
1160 nfs4_async_stop(vfsp);
1161 nfs4_async_manager_stop(vfsp);
1162 nfs4_remove_mi_from_server(mi, NULL);
1163 if (rtvp != NULL)
1164 VN_RELE(rtvp);
1165 if (mntzone != NULL)
1166 zone_rele(mntzone);
1167 /* need to remove it from the zone */
1168 removed = nfs4_mi_zonelist_remove(mi);
1169 if (removed)
1170 zone_rele_ref(&mi->mi_zone_ref,
1171 ZONE_REF_NFSV4);
1172 MI4_RELE(mi);
1173 if (!(uap->flags & MS_SYSSPACE) && args) {
1174 nfs4_free_args(args);
1175 kmem_free(args, sizeof (*args));
1177 return (error);
1179 if (svp_head)
1180 sv4_free(svp_head);
1183 if (!(uap->flags & MS_SYSSPACE) && args) {
1184 nfs4_free_args(args);
1185 kmem_free(args, sizeof (*args));
1187 if (rtvp != NULL)
1188 VN_RELE(rtvp);
1190 if (mntzone != NULL)
1191 zone_rele(mntzone);
1193 return (error);
1196 #ifdef DEBUG
1197 #define VERS_MSG "NFS4 server "
1198 #else
1199 #define VERS_MSG "NFS server "
1200 #endif
1202 #define READ_MSG \
1203 VERS_MSG "%s returned 0 for read transfer size"
1204 #define WRITE_MSG \
1205 VERS_MSG "%s returned 0 for write transfer size"
1206 #define SIZE_MSG \
1207 VERS_MSG "%s returned 0 for maximum file size"
1210 * Get the symbolic link text from the server for a given filehandle
1211 * of that symlink.
1213 * (get symlink text) PUTFH READLINK
1215 static int
1216 getlinktext_otw(mntinfo4_t *mi, nfs_fh4 *fh, char **linktextp, cred_t *cr,
1217 int flags)
1219 COMPOUND4args_clnt args;
1220 COMPOUND4res_clnt res;
1221 int doqueue;
1222 nfs_argop4 argop[2];
1223 nfs_resop4 *resop;
1224 READLINK4res *lr_res;
1225 uint_t len;
1226 bool_t needrecov = FALSE;
1227 nfs4_recov_state_t recov_state;
1228 nfs4_sharedfh_t *sfh;
1229 nfs4_error_t e;
1230 int num_retry = nfs4_max_mount_retry;
1231 int recovery = !(flags & NFS4_GETFH_NEEDSOP);
1233 sfh = sfh4_get(fh, mi);
1234 recov_state.rs_flags = 0;
1235 recov_state.rs_num_retry_despite_err = 0;
1237 recov_retry:
1238 nfs4_error_zinit(&e);
1240 args.array_len = 2;
1241 args.array = argop;
1242 args.ctag = TAG_GET_SYMLINK;
1244 if (! recovery) {
1245 e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
1246 if (e.error) {
1247 sfh4_rele(&sfh);
1248 return (e.error);
1252 /* 0. putfh symlink fh */
1253 argop[0].argop = OP_CPUTFH;
1254 argop[0].nfs_argop4_u.opcputfh.sfh = sfh;
1256 /* 1. readlink */
1257 argop[1].argop = OP_READLINK;
1259 doqueue = 1;
1261 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
1263 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
1265 if (needrecov && !recovery && num_retry-- > 0) {
1267 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1268 "getlinktext_otw: initiating recovery\n"));
1270 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
1271 OP_READLINK, NULL, NULL, NULL) == FALSE) {
1272 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
1273 if (!e.error)
1274 (void) xdr_free(xdr_COMPOUND4res_clnt,
1275 (caddr_t)&res);
1276 goto recov_retry;
1281 * If this is a non-NFSv4 protocol error and/or we weren't able to recover.
1283 if (e.error != 0) {
1284 if (! recovery)
1285 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
1286 sfh4_rele(&sfh);
1287 return (e.error);
1290 if (res.status) {
1291 e.error = geterrno4(res.status);
1292 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1293 if (! recovery)
1294 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
1295 sfh4_rele(&sfh);
1296 return (e.error);
1299 /* res.status == NFS4_OK */
1300 ASSERT(res.status == NFS4_OK);
1302 resop = &res.array[1]; /* readlink res */
1303 lr_res = &resop->nfs_resop4_u.opreadlink;
1305 /* treat symlink name as data */
1306 *linktextp = utf8_to_str((utf8string *)&lr_res->link, &len, NULL);
1308 if (! recovery)
1309 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
1310 sfh4_rele(&sfh);
1311 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1312 return (0);
1316 * Skip over consecutive slashes and "/./" in a pathname.
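 *
 * For example, given "//./export/home", a single call advances pn_path
 * past the leading slashes and the "/." so that a subsequent
 * pn_getcomponent() returns "export".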
1318 void
1319 pathname_skipslashdot(struct pathname *pnp)
1321 char *c1, *c2;
1323 while (pnp->pn_pathlen > 0 && *pnp->pn_path == '/') {
1325 c1 = pnp->pn_path + 1;
1326 c2 = pnp->pn_path + 2;
1328 if (*c1 == '.' && (*c2 == '/' || *c2 == '\0')) {
1329 pnp->pn_path = pnp->pn_path + 2; /* skip "/." */
1330 pnp->pn_pathlen = pnp->pn_pathlen - 2;
1331 } else {
1332 pnp->pn_path++;
1333 pnp->pn_pathlen--;
1339 * Resolve a symbolic link path. The symlink is in the nth component of
1340 * svp->sv_path and has an nfs4 file handle "fh".
1341 * Upon return, the sv_path will point to the new path that has the nth
1342 * component resolved to its symlink text.
1345 resolve_sympath(mntinfo4_t *mi, servinfo4_t *svp, int nth, nfs_fh4 *fh,
1346 cred_t *cr, int flags)
1348 char *oldpath;
1349 char *symlink, *newpath;
1350 struct pathname oldpn, newpn;
1351 char component[MAXNAMELEN];
1352 int i, addlen, error = 0;
1353 int oldpathlen;
1355 /* Get the symbolic link text over the wire. */
1356 error = getlinktext_otw(mi, fh, &symlink, cr, flags);
1358 if (error || symlink == NULL || strlen(symlink) == 0)
1359 return (error);
1362 * Compose the new pathname.
1363 * Note:
1364 * - only the nth component is resolved for the pathname.
1365 * - pathname.pn_pathlen does not count the ending null byte.
1367 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1368 oldpath = svp->sv_path;
1369 oldpathlen = svp->sv_pathlen;
1370 if (error = pn_get(oldpath, UIO_SYSSPACE, &oldpn)) {
1371 nfs_rw_exit(&svp->sv_lock);
1372 kmem_free(symlink, strlen(symlink) + 1);
1373 return (error);
1375 nfs_rw_exit(&svp->sv_lock);
1376 pn_alloc(&newpn);
1379 * Skip over previous components from the oldpath so that the
1380 * oldpn.pn_path will point to the symlink component. Skip
1381 * leading slashes and "/./" (no OP_LOOKUP on ".") so that
1382 * pn_getcomponent can get the component.
1384 for (i = 1; i < nth; i++) {
1385 pathname_skipslashdot(&oldpn);
1386 error = pn_getcomponent(&oldpn, component);
1387 if (error)
1388 goto out;
1392 * Copy the old path up to the component right before the symlink
1393 * if the symlink is not an absolute path.
1395 if (symlink[0] != '/') {
1396 addlen = oldpn.pn_path - oldpn.pn_buf;
1397 bcopy(oldpn.pn_buf, newpn.pn_path, addlen);
1398 newpn.pn_pathlen += addlen;
1399 newpn.pn_path += addlen;
1400 newpn.pn_buf[newpn.pn_pathlen] = '/';
1401 newpn.pn_pathlen++;
1402 newpn.pn_path++;
1405 /* copy the resolved symbolic link text */
1406 addlen = strlen(symlink);
1407 if (newpn.pn_pathlen + addlen >= newpn.pn_bufsize) {
1408 error = ENAMETOOLONG;
1409 goto out;
1411 bcopy(symlink, newpn.pn_path, addlen);
1412 newpn.pn_pathlen += addlen;
1413 newpn.pn_path += addlen;
1416 * Check if there is any remaining path after the symlink component.
1417 * First, skip the symlink component.
1419 pathname_skipslashdot(&oldpn);
1420 if (error = pn_getcomponent(&oldpn, component))
1421 goto out;
1423 addlen = pn_pathleft(&oldpn); /* includes counting the slash */
1426 * Copy the remaining path to the new pathname if there is any.
1428 if (addlen > 0) {
1429 if (newpn.pn_pathlen + addlen >= newpn.pn_bufsize) {
1430 error = ENAMETOOLONG;
1431 goto out;
1433 bcopy(oldpn.pn_path, newpn.pn_path, addlen);
1434 newpn.pn_pathlen += addlen;
1436 newpn.pn_buf[newpn.pn_pathlen] = '\0';
1438 /* get the newpath and store it in the servinfo4_t */
1439 newpath = kmem_alloc(newpn.pn_pathlen + 1, KM_SLEEP);
1440 bcopy(newpn.pn_buf, newpath, newpn.pn_pathlen);
1441 newpath[newpn.pn_pathlen] = '\0';
1443 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
1444 svp->sv_path = newpath;
1445 svp->sv_pathlen = strlen(newpath) + 1;
1446 nfs_rw_exit(&svp->sv_lock);
1448 kmem_free(oldpath, oldpathlen);
1449 out:
1450 kmem_free(symlink, strlen(symlink) + 1);
1451 pn_free(&newpn);
1452 pn_free(&oldpn);
1454 return (error);
1458 * This routine updates the servinfo4 structure with the new referred server
1459 * info.
1460 * nfsfsloc has the location related information
1461 * fsp has the hostname and pathname info.
1462 * new path = pathname from referral + part of orig pathname(based on nth).
1464 static void
1465 update_servinfo4(servinfo4_t *svp, fs_location4 *fsp,
1466 struct nfs_fsl_info *nfsfsloc, char *orig_path, int nth)
1468 struct knetconfig *knconf, *svknconf;
1469 struct netbuf *saddr;
1470 sec_data_t *secdata;
1471 utf8string *host;
1472 int i = 0, num_slashes = 0;
1473 char *p, *spath, *op, *new_path;
1475 /* Update knconf */
1476 knconf = svp->sv_knconf;
1477 free_knconf_contents(knconf);
1478 bzero(knconf, sizeof (struct knetconfig));
1479 svknconf = nfsfsloc->knconf;
1480 knconf->knc_semantics = svknconf->knc_semantics;
1481 knconf->knc_protofmly = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1482 knconf->knc_proto = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
1483 knconf->knc_rdev = svknconf->knc_rdev;
1484 bcopy(svknconf->knc_protofmly, knconf->knc_protofmly, KNC_STRSIZE);
1485 bcopy(svknconf->knc_proto, knconf->knc_proto, KNC_STRSIZE);
1487 /* Update server address */
1488 saddr = &svp->sv_addr;
1489 if (saddr->buf != NULL)
1490 kmem_free(saddr->buf, saddr->maxlen);
1491 saddr->buf = kmem_alloc(nfsfsloc->addr->maxlen, KM_SLEEP);
1492 saddr->len = nfsfsloc->addr->len;
1493 saddr->maxlen = nfsfsloc->addr->maxlen;
1494 bcopy(nfsfsloc->addr->buf, saddr->buf, nfsfsloc->addr->len);
1496 /* Update server name */
1497 host = fsp->server_val;
1498 kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
1499 svp->sv_hostname = kmem_zalloc(host->utf8string_len + 1, KM_SLEEP);
1500 bcopy(host->utf8string_val, svp->sv_hostname, host->utf8string_len);
1501 svp->sv_hostname[host->utf8string_len] = '\0';
1502 svp->sv_hostnamelen = host->utf8string_len + 1;
1505 * Update server path.
1506 * We need to set up the proper path here.
1507 * For example, if we got a path name serv1:/rp/aaa/bbb
1508 * where aaa is a referral and points to serv2:/rpool/aa
1509 * we need to set the path to serv2:/rpool/aa/bbb
1510 * The first part of the code below generates /rpool/aa
1511 * and the second part appends /bbb to the server path.
1513 spath = p = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1514 *p++ = '/';
1515 for (i = 0; i < fsp->rootpath.pathname4_len; i++) {
1516 component4 *comp;
1518 comp = &fsp->rootpath.pathname4_val[i];
1519 /* If no space, null the string and bail */
1520 if ((p - spath) + comp->utf8string_len + 1 > MAXPATHLEN) {
1521 p = spath + MAXPATHLEN - 1;
1522 spath[0] = '\0';
1523 break;
1525 bcopy(comp->utf8string_val, p, comp->utf8string_len);
1526 p += comp->utf8string_len;
1527 *p++ = '/';
1529 if (fsp->rootpath.pathname4_len != 0)
1530 *(p - 1) = '\0';
1531 else
1532 *p = '\0';
1533 p = spath;
1535 new_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1536 (void) strlcpy(new_path, p, MAXPATHLEN);
1537 kmem_free(p, MAXPATHLEN);
1538 i = strlen(new_path);
1540 for (op = orig_path; *op; op++) {
1541 if (*op == '/')
1542 num_slashes++;
1543 if (num_slashes == nth + 2) {
1544 while (*op != '\0') {
1545 new_path[i] = *op;
1546 i++;
1547 op++;
1549 break;
1552 new_path[i] = '\0';
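/*
 * Continuing the example above: the loop has now appended the part of
 * orig_path that follows the referral component ("/bbb") to the
 * referred-to root path ("/rpool/aa"), giving "/rpool/aa/bbb".
 */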
1554 kmem_free(svp->sv_path, svp->sv_pathlen);
1555 svp->sv_pathlen = strlen(new_path) + 1;
1556 svp->sv_path = kmem_alloc(svp->sv_pathlen, KM_SLEEP);
1557 bcopy(new_path, svp->sv_path, svp->sv_pathlen);
1558 kmem_free(new_path, MAXPATHLEN);
1561 * All the security data is specific to the old server.
1562 * Clean it up except secdata which deals with mount options.
1563 * We need to inherit that data. Copy secdata into our new servinfo4.
1565 if (svp->sv_dhsec) {
1566 sec_clnt_freeinfo(svp->sv_dhsec);
1567 svp->sv_dhsec = NULL;
1569 if (svp->sv_save_secinfo &&
1570 svp->sv_save_secinfo != svp->sv_secinfo) {
1571 secinfo_free(svp->sv_save_secinfo);
1572 svp->sv_save_secinfo = NULL;
1574 if (svp->sv_secinfo) {
1575 secinfo_free(svp->sv_secinfo);
1576 svp->sv_secinfo = NULL;
1578 svp->sv_currsec = NULL;
1580 secdata = kmem_alloc(sizeof (*secdata), KM_SLEEP);
1581 *secdata = *svp->sv_secdata;
1582 secdata->data = NULL;
1583 if (svp->sv_secdata) {
1584 sec_clnt_freeinfo(svp->sv_secdata);
1585 svp->sv_secdata = NULL;
1587 svp->sv_secdata = secdata;
1591 * Resolve a referral. The referral is in the n+1th component of
1592 * svp->sv_path and has a parent nfs4 file handle "fh".
1593 * Upon return, the sv_path will point to the new path that has referral
1594 * component resolved to its referred path and part of original path.
1595 * Hostname and other address information is also updated.
1598 resolve_referral(mntinfo4_t *mi, servinfo4_t *svp, cred_t *cr, int nth,
1599 nfs_fh4 *fh)
1601 nfs4_sharedfh_t *sfh;
1602 struct nfs_fsl_info nfsfsloc;
1603 nfs4_ga_res_t garp;
1604 COMPOUND4res_clnt callres;
1605 fs_location4 *fsp;
1606 char *nm, *orig_path;
1607 int orig_pathlen = 0, ret = -1, index;
1609 if (svp->sv_pathlen <= 0)
1610 return (ret);
1612 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
1613 orig_pathlen = svp->sv_pathlen;
1614 orig_path = kmem_alloc(orig_pathlen, KM_SLEEP);
1615 bcopy(svp->sv_path, orig_path, orig_pathlen);
1616 nm = extract_referral_point(svp->sv_path, nth);
1617 setup_newsvpath(svp, nth);
1618 nfs_rw_exit(&svp->sv_lock);
1620 sfh = sfh4_get(fh, mi);
1621 index = nfs4_process_referral(mi, sfh, nm, cr,
1622 &garp, &callres, &nfsfsloc);
1623 sfh4_rele(&sfh);
1624 kmem_free(nm, MAXPATHLEN);
1625 if (index < 0) {
1626 kmem_free(orig_path, orig_pathlen);
1627 return (index);
1630 fsp = &garp.n4g_ext_res->n4g_fslocations.locations_val[index];
1631 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
1632 update_servinfo4(svp, fsp, &nfsfsloc, orig_path, nth);
1633 nfs_rw_exit(&svp->sv_lock);
1635 mutex_enter(&mi->mi_lock);
1636 mi->mi_vfs_referral_loop_cnt++;
1637 mutex_exit(&mi->mi_lock);
1639 ret = 0;
1640 bad:
1641 /* Free up XDR memory allocated in nfs4_process_referral() */
1642 xdr_free(xdr_nfs_fsl_info, (char *)&nfsfsloc);
1643 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&callres);
1644 kmem_free(orig_path, orig_pathlen);
1646 return (ret);
1650 * Get the root filehandle for the given filesystem and server, and update
1651 * svp.
1653 * If NFS4_GETFH_NEEDSOP is set, then use nfs4_start_fop and nfs4_end_fop
1654 * to coordinate with recovery. Otherwise, the caller is assumed to be
1655 * the recovery thread or have already done a start_fop.
1657 * Errors are returned by the nfs4_error_t parameter.
1659 static void
1660 nfs4getfh_otw(struct mntinfo4 *mi, servinfo4_t *svp, vtype_t *vtp,
1661 int flags, cred_t *cr, nfs4_error_t *ep)
1663 COMPOUND4args_clnt args;
1664 COMPOUND4res_clnt res;
1665 int doqueue = 1;
1666 nfs_argop4 *argop;
1667 nfs_resop4 *resop;
1668 nfs4_ga_res_t *garp;
1669 int num_argops;
1670 lookup4_param_t lookuparg;
1671 nfs_fh4 *tmpfhp;
1672 nfs_fh4 *resfhp;
1673 bool_t needrecov = FALSE;
1674 nfs4_recov_state_t recov_state;
1675 int llndx;
1676 int nthcomp;
1677 int recovery = !(flags & NFS4_GETFH_NEEDSOP);
1679 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1680 ASSERT(svp->sv_path != NULL);
1681 if (svp->sv_path[0] == '\0') {
1682 nfs_rw_exit(&svp->sv_lock);
1683 nfs4_error_init(ep, EINVAL);
1684 return;
1686 nfs_rw_exit(&svp->sv_lock);
1688 recov_state.rs_flags = 0;
1689 recov_state.rs_num_retry_despite_err = 0;
1691 recov_retry:
1692 if (mi->mi_vfs_referral_loop_cnt >= NFS4_REFERRAL_LOOP_MAX) {
1693 DTRACE_PROBE3(nfs4clnt__debug__referral__loop, mntinfo4 *,
1694 mi, servinfo4_t *, svp, char *, "nfs4getfh_otw");
1695 nfs4_error_init(ep, EINVAL);
1696 return;
1698 nfs4_error_zinit(ep);
1700 if (!recovery) {
1701 ep->error = nfs4_start_fop(mi, NULL, NULL, OH_MOUNT,
1702 &recov_state, NULL);
1705 * If recovery has been started and this request was
1706 * initiated by a mount, then we must wait for recovery
1707 * to finish before proceeding, otherwise, the error
1708 * cleanup would remove data structures needed by the
1709 * recovery thread.
1711 if (ep->error) {
1712 mutex_enter(&mi->mi_lock);
1713 if (mi->mi_flags & MI4_MOUNTING) {
1714 mi->mi_flags |= MI4_RECOV_FAIL;
1715 mi->mi_error = EIO;
1717 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1718 "nfs4getfh_otw: waiting 4 recovery\n"));
1720 while (mi->mi_flags & MI4_RECOV_ACTIV)
1721 cv_wait(&mi->mi_failover_cv,
1722 &mi->mi_lock);
1724 mutex_exit(&mi->mi_lock);
1725 return;
1729 * If the client does not specify a specific flavor to use
1730 * and has not gotten a secinfo list from the server yet,
1731 * retrieve the secinfo list from the server and use a
1732 * flavor from the list to mount.
1734 * If we fail to get the secinfo list from the server, then
1735 * try the default flavor.
1737 if ((svp->sv_flags & SV4_TRYSECDEFAULT) &&
1738 svp->sv_secinfo == NULL) {
1739 (void) nfs4_secinfo_path(mi, cr, FALSE);
1743 if (recovery)
1744 args.ctag = TAG_REMAP_MOUNT;
1745 else
1746 args.ctag = TAG_MOUNT;
1748 lookuparg.l4_getattrs = LKP4_ALL_ATTRIBUTES;
1749 lookuparg.argsp = &args;
1750 lookuparg.resp = &res;
1751 lookuparg.header_len = 2; /* Putrootfh, getfh */
1752 lookuparg.trailer_len = 0;
1753 lookuparg.ga_bits = FATTR4_FSINFO_MASK;
1754 lookuparg.mi = mi;
1756 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1757 ASSERT(svp->sv_path != NULL);
1758 llndx = nfs4lookup_setup(svp->sv_path, &lookuparg, 0);
1759 nfs_rw_exit(&svp->sv_lock);
1761 argop = args.array;
1762 num_argops = args.array_len;
1764 /* choose public or root filehandle */
1765 if (flags & NFS4_GETFH_PUBLIC)
1766 argop[0].argop = OP_PUTPUBFH;
1767 else
1768 argop[0].argop = OP_PUTROOTFH;
1770 /* get fh */
1771 argop[1].argop = OP_GETFH;
1773 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1774 "nfs4getfh_otw: %s call, mi 0x%p",
1775 needrecov ? "recov" : "first", (void *)mi));
1777 rfs4call(mi, &args, &res, cr, &doqueue, RFSCALL_SOFT, ep);
1779 needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp);
1781 if (needrecov) {
1782 bool_t abort;
1784 if (recovery) {
1785 nfs4args_lookup_free(argop, num_argops);
1786 kmem_free(argop,
1787 lookuparg.arglen * sizeof (nfs_argop4));
1788 if (!ep->error)
1789 (void) xdr_free(xdr_COMPOUND4res_clnt,
1790 (caddr_t)&res);
1791 return;
1794 NFS4_DEBUG(nfs4_client_recov_debug,
1795 (CE_NOTE, "nfs4getfh_otw: initiating recovery\n"));
1797 abort = nfs4_start_recovery(ep, mi, NULL,
1798 NULL, NULL, NULL, OP_GETFH, NULL, NULL, NULL);
1799 if (!ep->error) {
1800 ep->error = geterrno4(res.status);
1801 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1803 nfs4args_lookup_free(argop, num_argops);
1804 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1805 nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state, needrecov);
1806 /* have another go? */
1807 if (abort == FALSE)
1808 goto recov_retry;
1809 return;
1813 * No recovery, but check if error is set.
1815 if (ep->error) {
1816 nfs4args_lookup_free(argop, num_argops);
1817 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1818 if (!recovery)
1819 nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
1820 needrecov);
1821 return;
1824 is_link_err:
1826 /* for non-recovery errors */
1827 if (res.status && res.status != NFS4ERR_SYMLINK &&
1828 res.status != NFS4ERR_MOVED) {
1829 if (!recovery) {
1830 nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
1831 needrecov);
1833 nfs4args_lookup_free(argop, num_argops);
1834 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1835 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1836 return;
1840 * If any intermediate component in the path is a symbolic link,
1841 * resolve the symlink, then try mount again using the new path.
1843 if (res.status == NFS4ERR_SYMLINK || res.status == NFS4ERR_MOVED) {
1844 int where;
1847 * Need to call nfs4_end_op before resolve_sympath to avoid
1848 * potential nfs4_start_op deadlock.
1850 if (!recovery)
1851 nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
1852 needrecov);
1855 * This must be from OP_LOOKUP failure. The (cfh) for this
1856 * OP_LOOKUP is a symlink node. Find out where the
1857 * OP_GETFH is for the (cfh) that is a symlink node.
1859 * Example:
1860 * (mount) PUTROOTFH, GETFH, LOOKUP comp1, GETFH, GETATTR,
1861 * LOOKUP comp2, GETFH, GETATTR, LOOKUP comp3, GETFH, GETATTR
1863 * LOOKUP comp3 fails with SYMLINK because comp2 is a symlink.
1864 * In this case, where = 7, nthcomp = 2.
1866 where = res.array_len - 2;
1867 ASSERT(where > 0);
1869 if (res.status == NFS4ERR_SYMLINK) {
1871 resop = &res.array[where - 1];
1872 ASSERT(resop->resop == OP_GETFH);
1873 tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
1874 nthcomp = res.array_len/3 - 1;
1875 ep->error = resolve_sympath(mi, svp, nthcomp,
1876 tmpfhp, cr, flags);
1878 } else if (res.status == NFS4ERR_MOVED) {
1880 resop = &res.array[where - 2];
1881 ASSERT(resop->resop == OP_GETFH);
1882 tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
1883 nthcomp = res.array_len/3 - 1;
1884 ep->error = resolve_referral(mi, svp, cr, nthcomp,
1885 tmpfhp);
1888 nfs4args_lookup_free(argop, num_argops);
1889 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1890 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1892 if (ep->error)
1893 return;
1895 goto recov_retry;
1898 /* getfh */
1899 resop = &res.array[res.array_len - 2];
1900 ASSERT(resop->resop == OP_GETFH);
1901 resfhp = &resop->nfs_resop4_u.opgetfh.object;
1903 /* getattr fsinfo res */
1904 resop++;
1905 garp = &resop->nfs_resop4_u.opgetattr.ga_res;
1907 *vtp = garp->n4g_va.va_type;
1909 mi->mi_fh_expire_type = garp->n4g_ext_res->n4g_fet;
1911 mutex_enter(&mi->mi_lock);
1912 if (garp->n4g_ext_res->n4g_pc4.pc4_link_support)
1913 mi->mi_flags |= MI4_LINK;
1914 if (garp->n4g_ext_res->n4g_pc4.pc4_symlink_support)
1915 mi->mi_flags |= MI4_SYMLINK;
1916 if (garp->n4g_ext_res->n4g_suppattrs & FATTR4_ACL_MASK)
1917 mi->mi_flags |= MI4_ACL;
1918 mutex_exit(&mi->mi_lock);
1920 if (garp->n4g_ext_res->n4g_maxread == 0)
1921 mi->mi_tsize =
1922 MIN(MAXBSIZE, mi->mi_tsize);
1923 else
1924 mi->mi_tsize =
1925 MIN(garp->n4g_ext_res->n4g_maxread,
1926 mi->mi_tsize);
1928 if (garp->n4g_ext_res->n4g_maxwrite == 0)
1929 mi->mi_stsize =
1930 MIN(MAXBSIZE, mi->mi_stsize);
1931 else
1932 mi->mi_stsize =
1933 MIN(garp->n4g_ext_res->n4g_maxwrite,
1934 mi->mi_stsize);
1936 if (garp->n4g_ext_res->n4g_maxfilesize != 0)
1937 mi->mi_maxfilesize =
1938 MIN(garp->n4g_ext_res->n4g_maxfilesize,
1939 mi->mi_maxfilesize);
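/*
 * The clamping above limits the client's transfer sizes to what the
 * server advertised; a reported value of 0 for maxread/maxwrite caps
 * the size at MAXBSIZE instead, and maxfilesize is only honored when
 * it is non-zero.
 */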
1942 * If the final component is a symbolic link, resolve the symlink,
1943 * then try the mount again using the new path.
1945 * Assume no symbolic link for the root filesystem "/".
1947 if (*vtp == VLNK) {
1949 * nthcomp is the total result length minus
1950 * the 1st 2 OPs (PUTROOTFH, GETFH),
1951 * then divided by 3 (LOOKUP,GETFH,GETATTR)
1953 * e.g. PUTROOTFH GETFH LOOKUP 1st-comp GETFH GETATTR
1954 * LOOKUP 2nd-comp GETFH GETATTR
1956 * (8 - 2)/3 = 2
1958 nthcomp = (res.array_len - 2)/3;
1961 * Need to call nfs4_end_op before resolve_sympath to avoid
1962 * potential nfs4_start_op deadlock. See RFE 4777612.
1964 if (!recovery)
1965 nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
1966 needrecov);
1968 ep->error = resolve_sympath(mi, svp, nthcomp, resfhp, cr,
1969 flags);
1971 nfs4args_lookup_free(argop, num_argops);
1972 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1973 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1975 if (ep->error)
1976 return;
1978 goto recov_retry;
1982 * We need to figure out where in the compound the getfh
1983 * for the parent directory is. If the object to be mounted is
1984 * the root, then there is no lookup at all:
1985 * PUTROOTFH, GETFH.
1986 * If the object to be mounted is in the root, then the compound is:
1987 * PUTROOTFH, GETFH, LOOKUP, GETFH, GETATTR.
1988 * In either of these cases, the index of the GETFH is 1.
1989 * If it is not at the root, then it's something like:
1990 * PUTROOTFH, GETFH, LOOKUP, GETFH, GETATTR,
1991 * LOOKUP, GETFH, GETATTR
1992 * In this case, the index is llndx (last lookup index) - 2.
1994 if (llndx == -1 || llndx == 2)
1995 resop = &res.array[1];
1996 else {
1997 ASSERT(llndx > 2);
1998 resop = &res.array[llndx-2];
2001 ASSERT(resop->resop == OP_GETFH);
2002 tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
2004 /* save the filehandles for the replica */
2005 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2006 ASSERT(tmpfhp->nfs_fh4_len <= NFS4_FHSIZE);
2007 svp->sv_pfhandle.fh_len = tmpfhp->nfs_fh4_len;
2008 bcopy(tmpfhp->nfs_fh4_val, svp->sv_pfhandle.fh_buf,
2009 tmpfhp->nfs_fh4_len);
2010 ASSERT(resfhp->nfs_fh4_len <= NFS4_FHSIZE);
2011 svp->sv_fhandle.fh_len = resfhp->nfs_fh4_len;
2012 bcopy(resfhp->nfs_fh4_val, svp->sv_fhandle.fh_buf, resfhp->nfs_fh4_len);
2014 /* initialize fsid and supp_attrs for server fs */
2015 svp->sv_fsid = garp->n4g_fsid;
2016 svp->sv_supp_attrs =
2017 garp->n4g_ext_res->n4g_suppattrs | FATTR4_MANDATTR_MASK;
2019 nfs_rw_exit(&svp->sv_lock);
2020 nfs4args_lookup_free(argop, num_argops);
2021 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
2022 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2023 if (!recovery)
2024 nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state, needrecov);
2028 * Save a copy of the servinfo4_t structure.
2029 * We might need it if getting the file handle fails in the referral
2030 * case, so we can replace the servinfo4 struct and try again.
2032 static struct servinfo4 *
2033 copy_svp(servinfo4_t *nsvp)
2035 servinfo4_t *svp = NULL;
2036 struct knetconfig *sknconf, *tknconf;
2037 struct netbuf *saddr, *taddr;
2039 svp = kmem_zalloc(sizeof (*svp), KM_SLEEP);
2040 nfs_rw_init(&svp->sv_lock, NULL, RW_DEFAULT, NULL);
2041 svp->sv_flags = nsvp->sv_flags;
2042 svp->sv_fsid = nsvp->sv_fsid;
2043 svp->sv_hostnamelen = nsvp->sv_hostnamelen;
2044 svp->sv_pathlen = nsvp->sv_pathlen;
2045 svp->sv_supp_attrs = nsvp->sv_supp_attrs;
2047 svp->sv_path = kmem_alloc(svp->sv_pathlen, KM_SLEEP);
2048 svp->sv_hostname = kmem_alloc(svp->sv_hostnamelen, KM_SLEEP);
2049 bcopy(nsvp->sv_hostname, svp->sv_hostname, svp->sv_hostnamelen);
2050 bcopy(nsvp->sv_path, svp->sv_path, svp->sv_pathlen);
2052 saddr = &nsvp->sv_addr;
2053 taddr = &svp->sv_addr;
2054 taddr->maxlen = saddr->maxlen;
2055 taddr->len = saddr->len;
2056 if (saddr->len > 0) {
2057 taddr->buf = kmem_zalloc(saddr->maxlen, KM_SLEEP);
2058 bcopy(saddr->buf, taddr->buf, saddr->len);
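/* Deep-copy the knetconfig, including its protocol and protocol-family strings. */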
2061 svp->sv_knconf = kmem_zalloc(sizeof (struct knetconfig), KM_SLEEP);
2062 sknconf = nsvp->sv_knconf;
2063 tknconf = svp->sv_knconf;
2064 tknconf->knc_semantics = sknconf->knc_semantics;
2065 tknconf->knc_rdev = sknconf->knc_rdev;
2066 if (sknconf->knc_proto != NULL) {
2067 tknconf->knc_proto = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
2068 bcopy(sknconf->knc_proto, (char *)tknconf->knc_proto,
2069 KNC_STRSIZE);
2071 if (sknconf->knc_protofmly != NULL) {
2072 tknconf->knc_protofmly = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
2073 bcopy(sknconf->knc_protofmly, (char *)tknconf->knc_protofmly,
2074 KNC_STRSIZE);
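/*
 * If an original knetconfig was saved in sv_origknconf, deep-copy it
 * as well; callback setup prefers it over sv_knconf (see
 * nfs4setclientid_otw()).
 */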
2077 if (nsvp->sv_origknconf != NULL) {
2078 svp->sv_origknconf = kmem_zalloc(sizeof (struct knetconfig),
2079 KM_SLEEP);
2080 sknconf = nsvp->sv_origknconf;
2081 tknconf = svp->sv_origknconf;
2082 tknconf->knc_semantics = sknconf->knc_semantics;
2083 tknconf->knc_rdev = sknconf->knc_rdev;
2084 if (sknconf->knc_proto != NULL) {
2085 tknconf->knc_proto = kmem_zalloc(KNC_STRSIZE, KM_SLEEP);
2086 bcopy(sknconf->knc_proto, (char *)tknconf->knc_proto,
2087 KNC_STRSIZE);
2089 if (sknconf->knc_protofmly != NULL) {
2090 tknconf->knc_protofmly = kmem_zalloc(KNC_STRSIZE,
2091 KM_SLEEP);
2092 bcopy(sknconf->knc_protofmly,
2093 (char *)tknconf->knc_protofmly, KNC_STRSIZE);
2097 svp->sv_secdata = copy_sec_data(nsvp->sv_secdata);
2098 svp->sv_dhsec = copy_sec_data(nsvp->sv_dhsec);
2100 * The rest of the security information is not copied, as it is
2101 * rebuilt from the information available in secdata and dhsec.
2103 svp->sv_next = NULL;
2105 return (svp);
2108 servinfo4_t *
2109 restore_svp(mntinfo4_t *mi, servinfo4_t *svp, servinfo4_t *origsvp)
2111 servinfo4_t *srvnext, *tmpsrv;
2113 if (strcmp(svp->sv_hostname, origsvp->sv_hostname) != 0) {
2115 * Since the hostname changed, we must be dealing
2116 * with a referral, and the lookup failed. We will
2117 * restore the whole servinfo4_t to what it was before.
2119 srvnext = svp->sv_next;
2120 svp->sv_next = NULL;
2121 tmpsrv = copy_svp(origsvp);
2122 sv4_free(svp);
2123 svp = tmpsrv;
2124 svp->sv_next = srvnext;
2125 mutex_enter(&mi->mi_lock);
2126 mi->mi_servers = svp;
2127 mi->mi_curr_serv = svp;
2128 mutex_exit(&mi->mi_lock);
2130 } else if (origsvp->sv_pathlen != svp->sv_pathlen) {
2133 * For symlink case: restore original path because
2134 * it might have contained symlinks that were
2135 * expanded by nfs4getfh_otw before the failure occurred.
2137 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2138 kmem_free(svp->sv_path, svp->sv_pathlen);
2139 svp->sv_path =
2140 kmem_alloc(origsvp->sv_pathlen, KM_SLEEP);
2141 svp->sv_pathlen = origsvp->sv_pathlen;
2142 bcopy(origsvp->sv_path, svp->sv_path,
2143 origsvp->sv_pathlen);
2144 nfs_rw_exit(&svp->sv_lock);
2146 return (svp);
2149 static ushort_t nfs4_max_threads = 8; /* max number of active async threads */
2150 uint_t nfs4_bsize = 32 * 1024; /* client `block' size */
2151 static uint_t nfs4_async_clusters = 1; /* # of reqs from each async queue */
2152 static uint_t nfs4_cots_timeo = NFS_COTS_TIMEO;
2155 * Remap the root filehandle for the given filesystem.
2157 * Results are returned via the nfs4_error_t parameter.
2159 void
2160 nfs4_remap_root(mntinfo4_t *mi, nfs4_error_t *ep, int flags)
2162 struct servinfo4 *svp, *origsvp;
2163 vtype_t vtype;
2164 nfs_fh4 rootfh;
2165 int getfh_flags;
2166 int num_retry;
2168 mutex_enter(&mi->mi_lock);
2170 remap_retry:
2171 svp = mi->mi_curr_serv;
2172 getfh_flags =
2173 (flags & NFS4_REMAP_NEEDSOP) ? NFS4_GETFH_NEEDSOP : 0;
2174 getfh_flags |=
2175 (mi->mi_flags & MI4_PUBLIC) ? NFS4_GETFH_PUBLIC : 0;
2176 mutex_exit(&mi->mi_lock);
2179 * Just in case server path being mounted contains
2180 * symlinks and fails w/STALE, save the initial sv_path
2181 * so we can redrive the initial mount compound with the
2182 * initial sv_path -- not a symlink-expanded version.
2184 * This could only happen if a symlink was expanded
2185 * and the expanded mount compound failed stale. Because
2186 * it could be the case that the symlink was removed at
2187 * the server (and replaced with another symlink/dir),
2188 * we need to use the initial sv_path when attempting
2189 * to re-lookup everything and recover.
2191 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2192 origsvp = copy_svp(svp);
2193 nfs_rw_exit(&svp->sv_lock);
2195 num_retry = nfs4_max_mount_retry;
2197 do {
2199 * Get the root fh from the server. Retry nfs4_max_mount_retry
2200 * (2) times if it fails with STALE since the recovery
2201 * infrastructure doesn't do STALE recovery for components
2202 * of the server path to the object being mounted.
2204 nfs4getfh_otw(mi, svp, &vtype, getfh_flags, CRED(), ep);
2206 if (ep->error == 0 && ep->stat == NFS4_OK)
2207 break;
2210 * For some reason, the mount compound failed. Before
2211 * retrying, we need to restore original conditions.
2213 svp = restore_svp(mi, svp, origsvp);
2215 } while (num_retry-- > 0);
2217 sv4_free(origsvp);
2219 if (ep->error != 0 || ep->stat != 0) {
2220 return;
2223 if (vtype != VNON && vtype != mi->mi_type) {
2224 /* shouldn't happen */
2225 zcmn_err(mi->mi_zone->zone_id, CE_WARN,
2226 "nfs4_remap_root: server root vnode type (%d) doesn't "
2227 "match mount info (%d)", vtype, mi->mi_type);
2230 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2231 rootfh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
2232 rootfh.nfs_fh4_len = svp->sv_fhandle.fh_len;
2233 nfs_rw_exit(&svp->sv_lock);
2234 sfh4_update(mi->mi_rootfh, &rootfh);
2237 * It's possible that recovery took place on the filesystem
2238 * and the server has been updated between the time we did
2239 * the nfs4getfh_otw and now. Re-drive the otw operation
2240 * to make sure we have a good fh.
2242 mutex_enter(&mi->mi_lock);
2243 if (mi->mi_curr_serv != svp)
2244 goto remap_retry;
2246 mutex_exit(&mi->mi_lock);
2249 static int
2250 nfs4rootvp(vnode_t **rtvpp, vfs_t *vfsp, struct servinfo4 *svp_head,
2251 int flags, cred_t *cr, zone_t *zone)
2253 vnode_t *rtvp = NULL;
2254 mntinfo4_t *mi;
2255 dev_t nfs_dev;
2256 int error = 0;
2257 rnode4_t *rp;
2258 int i, len;
2259 struct vattr va;
2260 vtype_t vtype = VNON;
2261 vtype_t tmp_vtype = VNON;
2262 struct servinfo4 *firstsvp = NULL, *svp = svp_head;
2263 nfs4_oo_hash_bucket_t *bucketp;
2264 nfs_fh4 fh;
2265 char *droptext = "";
2266 struct nfs_stats *nfsstatsp;
2267 nfs4_fname_t *mfname;
2268 nfs4_error_t e;
2269 int num_retry, removed;
2270 cred_t *lcr = NULL, *tcr = cr;
2271 struct servinfo4 *origsvp;
2272 char *resource;
2274 nfsstatsp = zone_getspecific(nfsstat_zone_key, nfs_zone());
2275 ASSERT(nfsstatsp != NULL);
2277 ASSERT(nfs_zone() == zone);
2278 ASSERT(crgetref(cr));
2281 * Create a mount record and link it to the vfs struct.
2283 mi = kmem_zalloc(sizeof (*mi), KM_SLEEP);
2284 mutex_init(&mi->mi_lock, NULL, MUTEX_DEFAULT, NULL);
2285 nfs_rw_init(&mi->mi_recovlock, NULL, RW_DEFAULT, NULL);
2286 nfs_rw_init(&mi->mi_rename_lock, NULL, RW_DEFAULT, NULL);
2287 nfs_rw_init(&mi->mi_fh_lock, NULL, RW_DEFAULT, NULL);
2289 if (!(flags & NFSMNT_SOFT))
2290 mi->mi_flags |= MI4_HARD;
2291 if ((flags & NFSMNT_NOPRINT))
2292 mi->mi_flags |= MI4_NOPRINT;
2293 if (flags & NFSMNT_INT)
2294 mi->mi_flags |= MI4_INT;
2295 if (flags & NFSMNT_PUBLIC)
2296 mi->mi_flags |= MI4_PUBLIC;
2297 if (flags & NFSMNT_MIRRORMOUNT)
2298 mi->mi_flags |= MI4_MIRRORMOUNT;
2299 if (flags & NFSMNT_REFERRAL)
2300 mi->mi_flags |= MI4_REFERRAL;
2301 mi->mi_retrans = NFS_RETRIES;
2302 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
2303 svp->sv_knconf->knc_semantics == NC_TPI_COTS)
2304 mi->mi_timeo = nfs4_cots_timeo;
2305 else
2306 mi->mi_timeo = NFS_TIMEO;
2307 mi->mi_prog = NFS_PROGRAM;
2308 mi->mi_vers = NFS_V4;
2309 mi->mi_rfsnames = rfsnames_v4;
2310 mi->mi_reqs = nfsstatsp->nfs_stats_v4.rfsreqcnt_ptr;
2311 cv_init(&mi->mi_failover_cv, NULL, CV_DEFAULT, NULL);
2312 mi->mi_servers = svp;
2313 mi->mi_curr_serv = svp;
2314 mi->mi_acregmin = SEC2HR(ACREGMIN);
2315 mi->mi_acregmax = SEC2HR(ACREGMAX);
2316 mi->mi_acdirmin = SEC2HR(ACDIRMIN);
2317 mi->mi_acdirmax = SEC2HR(ACDIRMAX);
2318 mi->mi_fh_expire_type = FH4_PERSISTENT;
2319 mi->mi_clientid_next = NULL;
2320 mi->mi_clientid_prev = NULL;
2321 mi->mi_srv = NULL;
2322 mi->mi_grace_wait = 0;
2323 mi->mi_error = 0;
2324 mi->mi_srvsettime = 0;
2325 mi->mi_srvset_cnt = 0;
2327 mi->mi_count = 1;
2329 mi->mi_tsize = nfs4_tsize(svp->sv_knconf);
2330 mi->mi_stsize = mi->mi_tsize;
2332 if (flags & NFSMNT_DIRECTIO)
2333 mi->mi_flags |= MI4_DIRECTIO;
2335 mi->mi_flags |= MI4_MOUNTING;
2338 * Make a vfs struct for nfs. We do this here instead of below
2339 * because rtvp needs a vfs before we can do a getattr on it.
2341 * Assign a unique device id to the mount
2343 mutex_enter(&nfs_minor_lock);
2344 do {
2345 nfs_minor = (nfs_minor + 1) & MAXMIN32;
2346 nfs_dev = makedevice(nfs_major, nfs_minor);
2347 } while (vfs_devismounted(nfs_dev));
2348 mutex_exit(&nfs_minor_lock);
2350 vfsp->vfs_dev = nfs_dev;
2351 vfs_make_fsid(&vfsp->vfs_fsid, nfs_dev, nfs4fstyp);
2352 vfsp->vfs_data = (caddr_t)mi;
2353 vfsp->vfs_fstype = nfsfstyp;
2354 vfsp->vfs_bsize = nfs4_bsize;
2357 * Initialize fields used to support async putpage operations.
2359 for (i = 0; i < NFS4_ASYNC_TYPES; i++)
2360 mi->mi_async_clusters[i] = nfs4_async_clusters;
2361 mi->mi_async_init_clusters = nfs4_async_clusters;
2362 mi->mi_async_curr[NFS4_ASYNC_QUEUE] =
2363 mi->mi_async_curr[NFS4_ASYNC_PGOPS_QUEUE] = &mi->mi_async_reqs[0];
2364 mi->mi_max_threads = nfs4_max_threads;
2365 mutex_init(&mi->mi_async_lock, NULL, MUTEX_DEFAULT, NULL);
2366 cv_init(&mi->mi_async_reqs_cv, NULL, CV_DEFAULT, NULL);
2367 cv_init(&mi->mi_async_work_cv[NFS4_ASYNC_QUEUE], NULL, CV_DEFAULT,
2368 NULL);
2369 cv_init(&mi->mi_async_work_cv[NFS4_ASYNC_PGOPS_QUEUE], NULL,
2370 CV_DEFAULT, NULL);
2371 cv_init(&mi->mi_async_cv, NULL, CV_DEFAULT, NULL);
2372 cv_init(&mi->mi_inact_req_cv, NULL, CV_DEFAULT, NULL);
2374 mi->mi_vfsp = vfsp;
2375 mi->mi_zone = zone;
2376 zone_init_ref(&mi->mi_zone_ref);
2377 zone_hold_ref(zone, &mi->mi_zone_ref, ZONE_REF_NFSV4);
2378 nfs4_mi_zonelist_add(mi);
2381 * Initialize the <open owner/cred> hash table.
2383 for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
2384 bucketp = &(mi->mi_oo_list[i]);
2385 mutex_init(&bucketp->b_lock, NULL, MUTEX_DEFAULT, NULL);
2386 list_create(&bucketp->b_oo_hash_list,
2387 sizeof (nfs4_open_owner_t),
2388 offsetof(nfs4_open_owner_t, oo_hash_node));
2392 * Initialize the freed open owner list.
2394 mi->mi_foo_num = 0;
2395 mi->mi_foo_max = NFS4_NUM_FREED_OPEN_OWNERS;
2396 list_create(&mi->mi_foo_list, sizeof (nfs4_open_owner_t),
2397 offsetof(nfs4_open_owner_t, oo_foo_node));
2399 list_create(&mi->mi_lost_state, sizeof (nfs4_lost_rqst_t),
2400 offsetof(nfs4_lost_rqst_t, lr_node));
2402 list_create(&mi->mi_bseqid_list, sizeof (nfs4_bseqid_entry_t),
2403 offsetof(nfs4_bseqid_entry_t, bs_node));
2406 * Initialize the msg buffer.
2408 list_create(&mi->mi_msg_list, sizeof (nfs4_debug_msg_t),
2409 offsetof(nfs4_debug_msg_t, msg_node));
2410 mi->mi_msg_count = 0;
2411 mutex_init(&mi->mi_msg_list_lock, NULL, MUTEX_DEFAULT, NULL);
2414 * Initialize kstats
2416 nfs4_mnt_kstat_init(vfsp);
2419 * Initialize the shared filehandle pool.
2421 sfh4_createtab(&mi->mi_filehandles);
2424 * Save server path we're attempting to mount.
2426 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2427 origsvp = copy_svp(svp);
2428 nfs_rw_exit(&svp->sv_lock);
2431 * Make the GETFH call to get root fh for each replica.
2433 if (svp_head->sv_next)
2434 droptext = ", dropping replica";
2437 * If the uid is set then set the creds for secure mounts
2438 * by proxy processes such as automountd.
2440 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2441 if (svp->sv_secdata->uid != 0 &&
2442 svp->sv_secdata->rpcflavor == RPCSEC_GSS) {
2443 lcr = crdup(cr);
2444 (void) crsetugid(lcr, svp->sv_secdata->uid, crgetgid(cr));
2445 tcr = lcr;
2447 nfs_rw_exit(&svp->sv_lock);
2448 for (svp = svp_head; svp; svp = svp->sv_next) {
2449 if (nfs4_chkdup_servinfo4(svp_head, svp)) {
2450 nfs_cmn_err(error, CE_WARN,
2451 VERS_MSG "Host %s is a duplicate%s",
2452 svp->sv_hostname, droptext);
2453 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2454 svp->sv_flags |= SV4_NOTINUSE;
2455 nfs_rw_exit(&svp->sv_lock);
2456 continue;
2458 mi->mi_curr_serv = svp;
2461 * Just in case server path being mounted contains
2462 * symlinks and fails w/STALE, save the initial sv_path
2463 * so we can redrive the initial mount compound with the
2464 * initial sv_path -- not a symlink-expanded version.
2466 * This could only happen if a symlink was expanded
2467 * and the expanded mount compound failed stale. Because
2468 * it could be the case that the symlink was removed at
2469 * the server (and replaced with another symlink/dir),
2470 * we need to use the initial sv_path when attempting
2471 * to re-lookup everything and recover.
2473 * Other mount errors should eventually be handled here also
2474 * (NFS4ERR_DELAY, NFS4ERR_RESOURCE). For now, all mount
2475 * failures will result in mount being redriven a few times.
2477 num_retry = nfs4_max_mount_retry;
2478 do {
2479 nfs4getfh_otw(mi, svp, &tmp_vtype,
2480 ((flags & NFSMNT_PUBLIC) ? NFS4_GETFH_PUBLIC : 0) |
2481 NFS4_GETFH_NEEDSOP, tcr, &e);
2483 if (e.error == 0 && e.stat == NFS4_OK)
2484 break;
2487 * For some reason, the mount compound failed. Before
2488 * retrying, we need to restore original conditions.
2490 svp = restore_svp(mi, svp, origsvp);
2491 svp_head = svp;
2493 } while (num_retry-- > 0);
2494 error = e.error ? e.error : geterrno4(e.stat);
2495 if (error) {
2496 nfs_cmn_err(error, CE_WARN,
2497 VERS_MSG "initial call to %s failed%s: %m",
2498 svp->sv_hostname, droptext);
2499 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2500 svp->sv_flags |= SV4_NOTINUSE;
2501 nfs_rw_exit(&svp->sv_lock);
2502 mi->mi_flags &= ~MI4_RECOV_FAIL;
2503 mi->mi_error = 0;
2504 continue;
2507 if (tmp_vtype == VBAD) {
2508 zcmn_err(mi->mi_zone->zone_id, CE_WARN,
2509 VERS_MSG "%s returned a bad file type for "
2510 "root%s", svp->sv_hostname, droptext);
2511 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2512 svp->sv_flags |= SV4_NOTINUSE;
2513 nfs_rw_exit(&svp->sv_lock);
2514 continue;
2517 if (vtype == VNON) {
2518 vtype = tmp_vtype;
2519 } else if (vtype != tmp_vtype) {
2520 zcmn_err(mi->mi_zone->zone_id, CE_WARN,
2521 VERS_MSG "%s returned a different file type "
2522 "for root%s", svp->sv_hostname, droptext);
2523 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2524 svp->sv_flags |= SV4_NOTINUSE;
2525 nfs_rw_exit(&svp->sv_lock);
2526 continue;
2528 if (firstsvp == NULL)
2529 firstsvp = svp;
2532 if (firstsvp == NULL) {
2533 if (error == 0)
2534 error = ENOENT;
2535 goto bad;
2538 mi->mi_curr_serv = svp = firstsvp;
2539 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2540 ASSERT((mi->mi_curr_serv->sv_flags & SV4_NOTINUSE) == 0);
2541 fh.nfs_fh4_len = svp->sv_fhandle.fh_len;
2542 fh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
2543 mi->mi_rootfh = sfh4_get(&fh, mi);
2544 fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
2545 fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
2546 mi->mi_srvparentfh = sfh4_get(&fh, mi);
2547 nfs_rw_exit(&svp->sv_lock);
2550 * Get the fname for filesystem root.
2552 mi->mi_fname = fn_get(NULL, ".", mi->mi_rootfh);
2553 mfname = mi->mi_fname;
2554 fn_hold(mfname);
2557 * Make the root vnode without attributes.
2559 rtvp = makenfs4node_by_fh(mi->mi_rootfh, NULL,
2560 &mfname, NULL, mi, cr, gethrtime());
2561 rtvp->v_type = vtype;
2563 mi->mi_curread = mi->mi_tsize;
2564 mi->mi_curwrite = mi->mi_stsize;
2567 * Start the manager thread responsible for handling async worker
2568 * threads.
2570 MI4_HOLD(mi);
2571 VFS_HOLD(vfsp); /* add reference for thread */
2572 mi->mi_manager_thread = zthread_create(NULL, 0, nfs4_async_manager,
2573 vfsp, 0, minclsyspri);
2574 ASSERT(mi->mi_manager_thread != NULL);
2577 * Create the thread that handles over-the-wire calls for
2578 * fop_inactive.
2579 * This needs to happen after the manager thread is created.
2581 MI4_HOLD(mi);
2582 mi->mi_inactive_thread = zthread_create(NULL, 0, nfs4_inactive_thread,
2583 mi, 0, minclsyspri);
2584 ASSERT(mi->mi_inactive_thread != NULL);
2586 /* If we didn't get a type, get one now */
2587 if (rtvp->v_type == VNON) {
2588 va.va_mask = AT_TYPE;
2589 error = nfs4getattr(rtvp, &va, tcr);
2590 if (error)
2591 goto bad;
2592 rtvp->v_type = va.va_type;
2595 mi->mi_type = rtvp->v_type;
2597 mutex_enter(&mi->mi_lock);
2598 mi->mi_flags &= ~MI4_MOUNTING;
2599 mutex_exit(&mi->mi_lock);
2601 /* Update VFS with new server and path info */
2602 if ((strcmp(svp->sv_hostname, origsvp->sv_hostname) != 0) ||
2603 (strcmp(svp->sv_path, origsvp->sv_path) != 0)) {
2604 len = svp->sv_hostnamelen + svp->sv_pathlen;
2605 resource = kmem_zalloc(len, KM_SLEEP);
2606 (void) strcat(resource, svp->sv_hostname);
2607 (void) strcat(resource, ":");
2608 (void) strcat(resource, svp->sv_path);
2609 vfs_setresource(vfsp, resource, 0);
2610 kmem_free(resource, len);
2613 sv4_free(origsvp);
2614 *rtvpp = rtvp;
2615 if (lcr != NULL)
2616 crfree(lcr);
2618 return (0);
2619 bad:
2621 * An error occurred somewhere, need to clean up...
2623 if (lcr != NULL)
2624 crfree(lcr);
2626 if (rtvp != NULL) {
2628 * We need to release our reference to the root vnode and
2629 * destroy the mntinfo4 struct that we just created.
2631 rp = VTOR4(rtvp);
2632 if (rp->r_flags & R4HASHED)
2633 rp4_rmhash(rp);
2634 VN_RELE(rtvp);
2636 nfs4_async_stop(vfsp);
2637 nfs4_async_manager_stop(vfsp);
2638 removed = nfs4_mi_zonelist_remove(mi);
2639 if (removed)
2640 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
2643 * This releases the initial "hold" of the mi since it will never
2644 * be referenced by the vfsp. Also, when mount returns to vfs.c
2645 * with an error, the vfsp will be destroyed, not rele'd.
2647 MI4_RELE(mi);
2649 if (origsvp != NULL)
2650 sv4_free(origsvp);
2652 *rtvpp = NULL;
2653 return (error);
2657 * vfs operations
2659 static int
2660 nfs4_unmount(vfs_t *vfsp, int flag, cred_t *cr)
2662 mntinfo4_t *mi;
2663 ushort_t omax;
2664 int removed;
2666 bool_t must_unlock;
2668 nfs4_ephemeral_tree_t *eph_tree;
2670 if (secpolicy_fs_unmount(cr, vfsp) != 0)
2671 return (EPERM);
2673 mi = VFTOMI4(vfsp);
2675 if (flag & MS_FORCE) {
2676 vfsp->vfs_flag |= VFS_UNMOUNTED;
2677 if (nfs_zone() != mi->mi_zone) {
2679 * If the request is coming from the wrong zone,
2680 * we don't want to create any new threads, and
2681 * performance is not a concern. Do everything
2682 * inline.
2684 NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2685 "nfs4_unmount x-zone forced unmount of vfs %p\n",
2686 (void *)vfsp));
2687 nfs4_free_mount(vfsp, flag, cr);
2688 } else {
2690 * Free data structures asynchronously, to avoid
2691 * blocking the current thread (for performance
2692 * reasons only).
2694 async_free_mount(vfsp, flag, cr);
2697 return (0);
2701 * Wait until all asynchronous putpage operations on
2702 * this file system are complete before flushing rnodes
2703 * from the cache.
2705 omax = mi->mi_max_threads;
2706 if (nfs4_async_stop_sig(vfsp))
2707 return (EINTR);
2709 r4flush(vfsp, cr);
2712 * About the only reason that this would fail would be
2713 * that the harvester is already busy tearing down this
2714 * node. So we fail back to the caller and let them try
2715 * again when needed.
2717 if (nfs4_ephemeral_umount(mi, flag, cr,
2718 &must_unlock, &eph_tree)) {
2719 ASSERT(must_unlock == FALSE);
2720 mutex_enter(&mi->mi_async_lock);
2721 mi->mi_max_threads = omax;
2722 mutex_exit(&mi->mi_async_lock);
2724 return (EBUSY);
2728 * If there are any active vnodes on this file system,
2729 * then the file system is busy and can't be unmounted.
2731 if (check_rtable4(vfsp)) {
2732 nfs4_ephemeral_umount_unlock(&must_unlock, &eph_tree);
2734 mutex_enter(&mi->mi_async_lock);
2735 mi->mi_max_threads = omax;
2736 mutex_exit(&mi->mi_async_lock);
2738 return (EBUSY);
2742 * The unmount can't fail from now on, so record any
2743 * ephemeral changes.
2745 nfs4_ephemeral_umount_activate(mi, &must_unlock, &eph_tree);
2748 * There are no active files that could require over-the-wire
2749 * calls to the server, so stop the async manager and the
2750 * inactive thread.
2752 nfs4_async_manager_stop(vfsp);
2755 * Destroy all rnodes belonging to this file system from the
2756 * rnode hash queues and purge any resources allocated to
2757 * them.
2759 destroy_rtable4(vfsp, cr);
2760 vfsp->vfs_flag |= VFS_UNMOUNTED;
2762 nfs4_remove_mi_from_server(mi, NULL);
2763 removed = nfs4_mi_zonelist_remove(mi);
2764 if (removed)
2765 zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
2767 return (0);
2771 * find root of nfs
2773 static int
2774 nfs4_root(vfs_t *vfsp, vnode_t **vpp)
2776 mntinfo4_t *mi;
2777 vnode_t *vp;
2778 nfs4_fname_t *mfname;
2779 servinfo4_t *svp;
2781 mi = VFTOMI4(vfsp);
2783 if (nfs_zone() != mi->mi_zone)
2784 return (EPERM);
2786 svp = mi->mi_curr_serv;
2787 if (svp) {
2788 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2789 if (svp->sv_flags & SV4_ROOT_STALE) {
2790 nfs_rw_exit(&svp->sv_lock);
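/*
 * Re-check SV4_ROOT_STALE under the writer lock; if it is still
 * set, clear it and fail this lookup with ENOENT.
 */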
2792 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2793 if (svp->sv_flags & SV4_ROOT_STALE) {
2794 svp->sv_flags &= ~SV4_ROOT_STALE;
2795 nfs_rw_exit(&svp->sv_lock);
2796 return (ENOENT);
2798 nfs_rw_exit(&svp->sv_lock);
2799 } else
2800 nfs_rw_exit(&svp->sv_lock);
2803 mfname = mi->mi_fname;
2804 fn_hold(mfname);
2805 vp = makenfs4node_by_fh(mi->mi_rootfh, NULL, &mfname, NULL,
2806 VFTOMI4(vfsp), CRED(), gethrtime());
2808 if (VTOR4(vp)->r_flags & R4STALE) {
2809 VN_RELE(vp);
2810 return (ENOENT);
2813 ASSERT(vp->v_type == VNON || vp->v_type == mi->mi_type);
2815 vp->v_type = mi->mi_type;
2817 *vpp = vp;
2819 return (0);
2822 static int
2823 nfs4_statfs_otw(vnode_t *vp, struct statvfs64 *sbp, cred_t *cr)
2825 int error;
2826 nfs4_ga_res_t gar;
2827 nfs4_ga_ext_res_t ger;
2829 gar.n4g_ext_res = &ger;
2831 if (error = nfs4_attr_otw(vp, TAG_FSINFO, &gar,
2832 NFS4_STATFS_ATTR_MASK, cr))
2833 return (error);
2835 *sbp = gar.n4g_ext_res->n4g_sb;
2837 return (0);
2841 * Get file system statistics.
2843 static int
2844 nfs4_statvfs(vfs_t *vfsp, struct statvfs64 *sbp)
2846 int error;
2847 vnode_t *vp;
2848 cred_t *cr;
2850 error = nfs4_root(vfsp, &vp);
2851 if (error)
2852 return (error);
2854 cr = CRED();
2856 error = nfs4_statfs_otw(vp, sbp, cr);
2857 if (!error) {
2858 (void) strncpy(sbp->f_basetype,
2859 vfssw[vfsp->vfs_fstype].vsw_name, FSTYPSZ);
2860 sbp->f_flag = vf_to_stf(vfsp->vfs_flag);
2861 } else {
2862 nfs4_purge_stale_fh(error, vp, cr);
2865 VN_RELE(vp);
2867 return (error);
2870 static kmutex_t nfs4_syncbusy;
2873 * Flush dirty nfs files for file system vfsp.
2874 * If vfsp == NULL, all nfs files are flushed.
2876 * SYNC_CLOSE in flag is passed to us to
2877 * indicate that we are shutting down and/or
2878 * rebooting.
2880 static int
2881 nfs4_sync(vfs_t *vfsp, short flag, cred_t *cr)
2884 * Cross-zone calls are OK here, since this translates to a
2885 * fop_putpage(B_ASYNC), which gets picked up by the right zone.
2887 if (!(flag & SYNC_ATTR) && mutex_tryenter(&nfs4_syncbusy) != 0) {
2888 r4flush(vfsp, cr);
2889 mutex_exit(&nfs4_syncbusy);
2893 * if SYNC_CLOSE is set then we know that
2894 * the system is rebooting, mark the mntinfo
2895 * for later examination.
2897 if (vfsp && (flag & SYNC_CLOSE)) {
2898 mntinfo4_t *mi;
2900 mi = VFTOMI4(vfsp);
2901 if (!(mi->mi_flags & MI4_SHUTDOWN)) {
2902 mutex_enter(&mi->mi_lock);
2903 mi->mi_flags |= MI4_SHUTDOWN;
2904 mutex_exit(&mi->mi_lock);
2907 return (0);
2911 * vget is difficult, if not impossible, to support in v4 because we don't
2912 * know the parent directory or name, which makes it impossible to create a
2913 * useful shadow vnode. And we need the shadow vnode for things like
2914 * OPEN.
2917 /* ARGSUSED */
2919 * XXX Check nfs4_vget_pseudo() for dependency.
2921 static int
2922 nfs4_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
2924 return (EREMOTE);
2928 * nfs4_mountroot gets called in the case where we are diskless booting. All
2929 * we need from here is the ability to get the server info and from there we
2930 * can simply call nfs4rootvp.
2932 /* ARGSUSED */
2933 static int
2934 nfs4_mountroot(vfs_t *vfsp, whymountroot_t why)
2936 vnode_t *rtvp;
2937 char root_hostname[SYS_NMLN+1];
2938 struct servinfo4 *svp;
2939 int error;
2940 int vfsflags;
2941 size_t size;
2942 char *root_path;
2943 struct pathname pn;
2944 char *name;
2945 cred_t *cr;
2946 mntinfo4_t *mi;
2947 struct nfs_args args; /* nfs mount arguments */
2948 static char token[10];
2949 nfs4_error_t n4e;
2951 bzero(&args, sizeof (args));
2953 /* do this BEFORE getfile which causes xid stamps to be initialized */
2954 clkset(-1L); /* hack for now - until we get time svc? */
2956 if (why == ROOT_REMOUNT) {
2958 * Shouldn't happen.
2960 panic("nfs4_mountroot: why == ROOT_REMOUNT");
2963 if (why == ROOT_UNMOUNT) {
2965 * Nothing to do for NFS.
2967 return (0);
2971 * why == ROOT_INIT
2974 name = token;
2975 *name = 0;
2976 (void) getfsname("root", name, sizeof (token));
2978 pn_alloc(&pn);
2979 root_path = pn.pn_path;
2981 svp = kmem_zalloc(sizeof (*svp), KM_SLEEP);
2982 nfs_rw_init(&svp->sv_lock, NULL, RW_DEFAULT, NULL);
2983 svp->sv_knconf = kmem_zalloc(sizeof (*svp->sv_knconf), KM_SLEEP);
2984 svp->sv_knconf->knc_protofmly = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
2985 svp->sv_knconf->knc_proto = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
2988 * Get server address
2989 * Get the root path
2990 * Get server's transport
2991 * Get server's hostname
2992 * Get options
2994 args.addr = &svp->sv_addr;
2995 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2996 args.fh = (char *)&svp->sv_fhandle;
2997 args.knconf = svp->sv_knconf;
2998 args.hostname = root_hostname;
2999 vfsflags = 0;
3000 if (error = mount_root(*name ? name : "root", root_path, NFS_V4,
3001 &args, &vfsflags)) {
3002 if (error == EPROTONOSUPPORT)
3003 nfs_cmn_err(error, CE_WARN, "nfs4_mountroot: "
3004 "mount_root failed: server doesn't support NFS V4");
3005 else
3006 nfs_cmn_err(error, CE_WARN,
3007 "nfs4_mountroot: mount_root failed: %m");
3008 nfs_rw_exit(&svp->sv_lock);
3009 sv4_free(svp);
3010 pn_free(&pn);
3011 return (error);
3013 nfs_rw_exit(&svp->sv_lock);
3014 svp->sv_hostnamelen = (int)(strlen(root_hostname) + 1);
3015 svp->sv_hostname = kmem_alloc(svp->sv_hostnamelen, KM_SLEEP);
3016 (void) strcpy(svp->sv_hostname, root_hostname);
3018 svp->sv_pathlen = (int)(strlen(root_path) + 1);
3019 svp->sv_path = kmem_alloc(svp->sv_pathlen, KM_SLEEP);
3020 (void) strcpy(svp->sv_path, root_path);
3023 * Force root partition to always be mounted with AUTH_UNIX for now
3025 svp->sv_secdata = kmem_alloc(sizeof (*svp->sv_secdata), KM_SLEEP);
3026 svp->sv_secdata->secmod = AUTH_UNIX;
3027 svp->sv_secdata->rpcflavor = AUTH_UNIX;
3028 svp->sv_secdata->data = NULL;
3030 cr = crgetcred();
3031 rtvp = NULL;
3033 error = nfs4rootvp(&rtvp, vfsp, svp, args.flags, cr, global_zone);
3035 if (error) {
3036 crfree(cr);
3037 pn_free(&pn);
3038 sv4_free(svp);
3039 return (error);
3042 mi = VTOMI4(rtvp);
3045 * Send client id to the server, if necessary
3047 nfs4_error_zinit(&n4e);
3048 nfs4setclientid(mi, cr, FALSE, &n4e);
3049 error = n4e.error;
3051 crfree(cr);
3053 if (error) {
3054 pn_free(&pn);
3055 goto errout;
3058 error = nfs4_setopts(rtvp, DATAMODEL_NATIVE, &args);
3059 if (error) {
3060 nfs_cmn_err(error, CE_WARN,
3061 "nfs4_mountroot: invalid root mount options");
3062 pn_free(&pn);
3063 goto errout;
3066 (void) vfs_lock_wait(vfsp);
3067 vfs_add(NULL, vfsp, vfsflags);
3068 vfs_unlock(vfsp);
3070 size = strlen(svp->sv_hostname);
3071 (void) strcpy(rootfs.bo_name, svp->sv_hostname);
3072 rootfs.bo_name[size] = ':';
3073 (void) strcpy(&rootfs.bo_name[size + 1], root_path);
3075 pn_free(&pn);
3077 errout:
3078 if (error) {
3079 sv4_free(svp);
3080 nfs4_async_stop(vfsp);
3081 nfs4_async_manager_stop(vfsp);
3084 if (rtvp != NULL)
3085 VN_RELE(rtvp);
3087 return (error);
3091 * Initialization routine for VFS routines. Should only be called once
3094 nfs4_vfsinit(void)
3096 mutex_init(&nfs4_syncbusy, NULL, MUTEX_DEFAULT, NULL);
3097 nfs4setclientid_init();
3098 nfs4_ephemeral_init();
3099 return (0);
3102 void
3103 nfs4_vfsfini(void)
3105 nfs4_ephemeral_fini();
3106 nfs4setclientid_fini();
3107 mutex_destroy(&nfs4_syncbusy);
3110 void
3111 nfs4_freevfs(vfs_t *vfsp)
3113 mntinfo4_t *mi;
3115 /* need to release the initial hold */
3116 mi = VFTOMI4(vfsp);
3119 * At this point, we can no longer reference the vfs
3120 * and need to inform other holders of the reference
3121 * to the mntinfo4_t.
3123 mi->mi_vfsp = NULL;
3125 MI4_RELE(mi);
3129 * Client side SETCLIENTID and SETCLIENTID_CONFIRM
3131 struct nfs4_server nfs4_server_lst =
3132 { &nfs4_server_lst, &nfs4_server_lst };
3134 kmutex_t nfs4_server_lst_lock;
3136 static void
3137 nfs4setclientid_init(void)
3139 mutex_init(&nfs4_server_lst_lock, NULL, MUTEX_DEFAULT, NULL);
3142 static void
3143 nfs4setclientid_fini(void)
3145 mutex_destroy(&nfs4_server_lst_lock);
3148 int nfs4_retry_sclid_delay = NFS4_RETRY_SCLID_DELAY;
3149 int nfs4_num_sclid_retries = NFS4_NUM_SCLID_RETRIES;
3152 * Set the clientid for the server for "mi". No-op if the clientid is
3153 * already set.
3155 * The recovery boolean should be set to TRUE if this function was called
3156 * by the recovery code, and FALSE otherwise. This is used to determine
3157 * if we need to call nfs4_start/end_op as well as grab the mi_recovlock
3158 * for adding a mntinfo4_t to a nfs4_server_t.
3160 * Error is returned via 'n4ep'. If there was a 'n4ep->stat' error, then
3161 * 'n4ep->error' is set to geterrno4(n4ep->stat).
3163 void
3164 nfs4setclientid(mntinfo4_t *mi, cred_t *cr, bool_t recovery, nfs4_error_t *n4ep)
3166 struct nfs4_server *np;
3167 struct servinfo4 *svp = mi->mi_curr_serv;
3168 nfs4_recov_state_t recov_state;
3169 int num_retries = 0;
3170 bool_t retry;
3171 cred_t *lcr = NULL;
3172 int retry_inuse = 1; /* only retry once on NFS4ERR_CLID_INUSE */
3173 time_t lease_time = 0;
3175 recov_state.rs_flags = 0;
3176 recov_state.rs_num_retry_despite_err = 0;
3177 ASSERT(n4ep != NULL);
3179 recov_retry:
3180 retry = FALSE;
3181 nfs4_error_zinit(n4ep);
3182 if (!recovery)
3183 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
3185 mutex_enter(&nfs4_server_lst_lock);
3186 np = servinfo4_to_nfs4_server(svp); /* This locks np if it is found */
3187 mutex_exit(&nfs4_server_lst_lock);
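/*
 * No nfs4_server_t exists for this server yet: create one, then
 * re-check the list under nfs4_server_lst_lock in case another
 * thread added an entry while we were allocating.
 */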
3188 if (!np) {
3189 struct nfs4_server *tnp;
3190 np = new_nfs4_server(svp, cr);
3191 mutex_enter(&np->s_lock);
3193 mutex_enter(&nfs4_server_lst_lock);
3194 tnp = servinfo4_to_nfs4_server(svp);
3195 if (tnp) {
3197 * another thread snuck in and put server on list.
3198 * since we aren't adding it to the nfs4_server_list
3199 * we need to set the ref count to 0 and destroy it.
3201 np->s_refcnt = 0;
3202 destroy_nfs4_server(np);
3203 np = tnp;
3204 } else {
3206 * do not give list a reference until everything
3207 * succeeds
3209 insque(np, &nfs4_server_lst);
3211 mutex_exit(&nfs4_server_lst_lock);
3213 ASSERT(MUTEX_HELD(&np->s_lock));
3215 * If we find the server already has N4S_CLIENTID_SET, then
3216 * just return; we've already done SETCLIENTID to that server
3218 if (np->s_flags & N4S_CLIENTID_SET) {
3219 /* add mi to np's mntinfo4_list */
3220 nfs4_add_mi_to_server(np, mi);
3221 if (!recovery)
3222 nfs_rw_exit(&mi->mi_recovlock);
3223 mutex_exit(&np->s_lock);
3224 nfs4_server_rele(np);
3225 return;
3227 mutex_exit(&np->s_lock);
3231 * Drop the mi_recovlock since nfs4_start_op will
3232 * acquire it again for us.
3234 if (!recovery) {
3235 nfs_rw_exit(&mi->mi_recovlock);
3237 n4ep->error = nfs4_start_op(mi, NULL, NULL, &recov_state);
3238 if (n4ep->error) {
3239 nfs4_server_rele(np);
3240 return;
3244 mutex_enter(&np->s_lock);
3245 while (np->s_flags & N4S_CLIENTID_PEND) {
3246 if (!cv_wait_sig(&np->s_clientid_pend, &np->s_lock)) {
3247 mutex_exit(&np->s_lock);
3248 nfs4_server_rele(np);
3249 if (!recovery)
3250 nfs4_end_op(mi, NULL, NULL, &recov_state,
3251 recovery);
3252 n4ep->error = EINTR;
3253 return;
3257 if (np->s_flags & N4S_CLIENTID_SET) {
3258 /* XXX copied/pasted from above */
3259 /* add mi to np's mntinfo4_list */
3260 nfs4_add_mi_to_server(np, mi);
3261 mutex_exit(&np->s_lock);
3262 nfs4_server_rele(np);
3263 if (!recovery)
3264 nfs4_end_op(mi, NULL, NULL, &recov_state, recovery);
3265 return;
3269 * Reset the N4S_CB_PINGED flag. This is used to
3270 * indicate if we have received a CB_NULL from the
3271 * server. Also we reset the waiter flag.
3273 np->s_flags &= ~(N4S_CB_PINGED | N4S_CB_WAITER);
3274 /* any failure must now clear this flag */
3275 np->s_flags |= N4S_CLIENTID_PEND;
3276 mutex_exit(&np->s_lock);
3277 nfs4setclientid_otw(mi, svp, cr, np, n4ep, &retry_inuse);
3279 if (n4ep->error == EACCES) {
3281 * If the uid is set then set the creds for secure mounts
3282 * by proxy processes such as automountd.
3284 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3285 if (svp->sv_secdata->uid != 0) {
3286 lcr = crdup(cr);
3287 (void) crsetugid(lcr, svp->sv_secdata->uid,
3288 crgetgid(cr));
3290 nfs_rw_exit(&svp->sv_lock);
3292 if (lcr != NULL) {
3293 mutex_enter(&np->s_lock);
3294 crfree(np->s_cred);
3295 np->s_cred = lcr;
3296 mutex_exit(&np->s_lock);
3297 nfs4setclientid_otw(mi, svp, lcr, np, n4ep,
3298 &retry_inuse);
3301 mutex_enter(&np->s_lock);
3302 lease_time = np->s_lease_time;
3303 np->s_flags &= ~N4S_CLIENTID_PEND;
3304 mutex_exit(&np->s_lock);
3306 if (n4ep->error != 0 || n4ep->stat != NFS4_OK) {
3308 * Start recovery if failover is a possibility. If
3309 * invoked by the recovery thread itself, then just
3310 * return and let it handle the failover first. NB:
3311 * recovery is not allowed if the mount is in progress
3312 * since the infrastructure is not sufficiently set up
3313 * to allow it. Just return the error (after suitable
3314 * retries).
3316 if (FAILOVER_MOUNT4(mi) && nfs4_try_failover(n4ep)) {
3317 (void) nfs4_start_recovery(n4ep, mi, NULL,
3318 NULL, NULL, NULL, OP_SETCLIENTID, NULL, NULL, NULL);
3320 * Don't retry here, just return and let
3321 * recovery take over.
3323 if (recovery)
3324 retry = FALSE;
3325 } else if (nfs4_rpc_retry_error(n4ep->error) ||
3326 n4ep->stat == NFS4ERR_RESOURCE ||
3327 n4ep->stat == NFS4ERR_STALE_CLIENTID) {
3329 retry = TRUE;
3331 * Always retry if we are in recovery, or if we once
3332 * had contact with the server (but it is now
3333 * overloaded).
3335 if (recovery == TRUE ||
3336 n4ep->error == ETIMEDOUT ||
3337 n4ep->error == ECONNRESET)
3338 num_retries = 0;
3339 } else if (retry_inuse && n4ep->error == 0 &&
3340 n4ep->stat == NFS4ERR_CLID_INUSE) {
3341 retry = TRUE;
3342 num_retries = 0;
3344 } else {
3346 * Since everything succeeded, give the list a reference if it
3347 * hasn't already been given one, either by add_new_nfs4_server()
3348 * or because (in a recovery situation) the server is already on
3349 * the list.
3351 mutex_enter(&np->s_lock);
3352 if ((np->s_flags & N4S_INSERTED) == 0) {
3353 np->s_refcnt++;
3354 np->s_flags |= N4S_INSERTED;
3356 mutex_exit(&np->s_lock);
3359 if (!recovery)
3360 nfs4_end_op(mi, NULL, NULL, &recov_state, recovery);
3363 if (retry && num_retries++ < nfs4_num_sclid_retries) {
3364 if (retry_inuse) {
3365 delay(SEC_TO_TICK(lease_time + nfs4_retry_sclid_delay));
3366 retry_inuse = 0;
3367 } else
3368 delay(SEC_TO_TICK(nfs4_retry_sclid_delay));
3370 nfs4_server_rele(np);
3371 goto recov_retry;
3375 if (n4ep->error == 0)
3376 n4ep->error = geterrno4(n4ep->stat);
3378 /* broadcast before release in case no other threads are waiting */
3379 cv_broadcast(&np->s_clientid_pend);
3380 nfs4_server_rele(np);
3383 int nfs4setclientid_otw_debug = 0;
3386 * This function handles the recovery of STALE_CLIENTID for SETCLIENTID_CONFIRM,
3387 * but nothing else; the calling function must be designed to handle those
3388 * other errors.
3390 static void
3391 nfs4setclientid_otw(mntinfo4_t *mi, struct servinfo4 *svp, cred_t *cr,
3392 struct nfs4_server *np, nfs4_error_t *ep, int *retry_inusep)
3394 COMPOUND4args_clnt args;
3395 COMPOUND4res_clnt res;
3396 nfs_argop4 argop[3];
3397 SETCLIENTID4args *s_args;
3398 SETCLIENTID4resok *s_resok;
3399 int doqueue = 1;
3400 nfs4_ga_res_t *garp = NULL;
3401 timespec_t prop_time, after_time;
3402 verifier4 verf;
3403 clientid4 tmp_clientid;
3405 ASSERT(!MUTEX_HELD(&np->s_lock));
3407 args.ctag = TAG_SETCLIENTID;
3409 args.array = argop;
3410 args.array_len = 3;
3412 /* PUTROOTFH */
3413 argop[0].argop = OP_PUTROOTFH;
3415 /* GETATTR */
3416 argop[1].argop = OP_GETATTR;
3417 argop[1].nfs_argop4_u.opgetattr.attr_request = FATTR4_LEASE_TIME_MASK;
3418 argop[1].nfs_argop4_u.opgetattr.mi = mi;
3420 /* SETCLIENTID */
3421 argop[2].argop = OP_SETCLIENTID;
3423 s_args = &argop[2].nfs_argop4_u.opsetclientid;
3425 mutex_enter(&np->s_lock);
3427 s_args->client.verifier = np->clidtosend.verifier;
3428 s_args->client.id_len = np->clidtosend.id_len;
3429 ASSERT(s_args->client.id_len <= NFS4_OPAQUE_LIMIT);
3430 s_args->client.id_val = np->clidtosend.id_val;
3433 * The callback needs to happen on a non-RDMA transport.
3434 * Check if we have saved the original knetconfig;
3435 * if so, use that instead.
3437 if (svp->sv_origknconf != NULL)
3438 nfs4_cb_args(np, svp->sv_origknconf, s_args);
3439 else
3440 nfs4_cb_args(np, svp->sv_knconf, s_args);
3442 mutex_exit(&np->s_lock);
3444 rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);
3446 if (ep->error)
3447 return;
3449 /* getattr lease_time res */
3450 if ((res.array_len >= 2) &&
3451 (res.array[1].nfs_resop4_u.opgetattr.status == NFS4_OK)) {
3452 garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
3454 #ifndef _LP64
3456 * The 32 bit client cannot handle a lease time greater than
3457 * (INT32_MAX/1000000). This is due to the use of the
3458 * lease_time in calls to drv_usectohz() in
3459 * nfs4_renew_lease_thread(). The problem is that
3460 * drv_usectohz() takes a time_t (which is just a long = 4
3461 * bytes) as its parameter. The lease_time is multiplied by
3462 * 1000000 to convert seconds to usecs for the parameter. If
3463 * a number bigger than (INT32_MAX/1000000) is used then we
3464 * overflow on the 32bit client.
3466 if (garp->n4g_ext_res->n4g_leasetime > (INT32_MAX/1000000)) {
3467 garp->n4g_ext_res->n4g_leasetime = INT32_MAX/1000000;
3469 #endif
3471 mutex_enter(&np->s_lock);
3472 np->s_lease_time = garp->n4g_ext_res->n4g_leasetime;
3475 * Keep track of the lease period for the mi's
3476 * mi_msg_list. We need an appropriate time
3477 * bound to associate past facts with a current
3478 * event. The lease period is perfect for this.
3480 mutex_enter(&mi->mi_msg_list_lock);
3481 mi->mi_lease_period = np->s_lease_time;
3482 mutex_exit(&mi->mi_msg_list_lock);
3483 mutex_exit(&np->s_lock);
3487 if (res.status == NFS4ERR_CLID_INUSE) {
3488 clientaddr4 *clid_inuse;
3490 if (!(*retry_inusep)) {
3491 clid_inuse = &res.array->nfs_resop4_u.
3492 opsetclientid.SETCLIENTID4res_u.client_using;
3494 zcmn_err(mi->mi_zone->zone_id, CE_NOTE,
3495 "NFS4 mount (SETCLIENTID failed)."
3496 " nfs4_client_id.id is in "
3497 "use already by: r_netid<%s> r_addr<%s>",
3498 clid_inuse->r_netid, clid_inuse->r_addr);
3502 * XXX - The client should be more robust in its
3503 * handling of clientid in use errors (regen another
3504 * clientid and try again?)
3506 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3507 return;
3510 if (res.status) {
3511 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3512 return;
3515 s_resok = &res.array[2].nfs_resop4_u.
3516 opsetclientid.SETCLIENTID4res_u.resok4;
3518 tmp_clientid = s_resok->clientid;
3520 verf = s_resok->setclientid_confirm;
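/*
 * Both the clientid and the confirm verifier are needed for the
 * SETCLIENTID_CONFIRM compound built below.
 */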
3522 #ifdef DEBUG
3523 if (nfs4setclientid_otw_debug) {
3524 union {
3525 clientid4 clientid;
3526 int foo[2];
3527 } cid;
3529 cid.clientid = s_resok->clientid;
3531 zcmn_err(mi->mi_zone->zone_id, CE_NOTE,
3532 "nfs4setclientid_otw: OK, clientid = %x,%x, "
3533 "verifier = %" PRIx64 "\n", cid.foo[0], cid.foo[1], verf);
3535 #endif
3537 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3539 /* Confirm the client id and get the lease_time attribute */
3541 args.ctag = TAG_SETCLIENTID_CF;
3543 args.array = argop;
3544 args.array_len = 1;
3546 argop[0].argop = OP_SETCLIENTID_CONFIRM;
3548 argop[0].nfs_argop4_u.opsetclientid_confirm.clientid = tmp_clientid;
3549 argop[0].nfs_argop4_u.opsetclientid_confirm.setclientid_confirm = verf;
3551 /* used to figure out RTT for np */
3552 gethrestime(&prop_time);
3554 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4setclientid_otw: "
3555 "start time: %ld sec %ld nsec", prop_time.tv_sec,
3556 prop_time.tv_nsec));
3558 rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);
3560 gethrestime(&after_time);
3561 mutex_enter(&np->s_lock);
3562 np->propagation_delay.tv_sec =
3563 MAX(1, after_time.tv_sec - prop_time.tv_sec);
3564 mutex_exit(&np->s_lock);
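/*
 * The elapsed time of the SETCLIENTID_CONFIRM call, floored at one
 * second, is recorded as the server's propagation delay.
 */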
3566 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4setclientid_otw: "
3567 "finish time: %ld sec ", after_time.tv_sec));
3569 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4setclientid_otw: "
3570 "propagation delay set to %ld sec",
3571 np->propagation_delay.tv_sec));
3573 if (ep->error)
3574 return;
3576 if (res.status == NFS4ERR_CLID_INUSE) {
3577 clientaddr4 *clid_inuse;
3579 if (!(*retry_inusep)) {
3580 clid_inuse = &res.array->nfs_resop4_u.
3581 opsetclientid.SETCLIENTID4res_u.client_using;
3583 zcmn_err(mi->mi_zone->zone_id, CE_NOTE,
3584 "SETCLIENTID_CONFIRM failed. "
3585 "nfs4_client_id.id is in use already by: "
3586 "r_netid<%s> r_addr<%s>",
3587 clid_inuse->r_netid, clid_inuse->r_addr);
3590 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3591 return;
3594 if (res.status) {
3595 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3596 return;
3599 mutex_enter(&np->s_lock);
3600 np->clientid = tmp_clientid;
3601 np->s_flags |= N4S_CLIENTID_SET;
3603 /* Add mi to np's mntinfo4 list */
3604 nfs4_add_mi_to_server(np, mi);
3606 if (np->lease_valid == NFS4_LEASE_NOT_STARTED) {
3608 * Start lease management thread.
3609 * Keep trying until we succeed.
3612 np->s_refcnt++; /* pass reference to thread */
3613 (void) zthread_create(NULL, 0, nfs4_renew_lease_thread, np, 0,
3614 minclsyspri);
3616 mutex_exit(&np->s_lock);
3618 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3622 * Add mi to sp's mntinfo4_list if it isn't already in the list. Makes
3623 * mi's clientid the same as sp's.
3624 * Assumes sp is locked down.
3626 void
3627 nfs4_add_mi_to_server(nfs4_server_t *sp, mntinfo4_t *mi)
3629 mntinfo4_t *tmi;
3630 int in_list = 0;
3632 ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
3633 nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
3634 ASSERT(sp != &nfs4_server_lst);
3635 ASSERT(MUTEX_HELD(&sp->s_lock));
3637 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3638 "nfs4_add_mi_to_server: add mi %p to sp %p",
3639 (void*)mi, (void*)sp));
3641 for (tmi = sp->mntinfo4_list;
3642 tmi != NULL;
3643 tmi = tmi->mi_clientid_next) {
3644 if (tmi == mi) {
3645 NFS4_DEBUG(nfs4_client_lease_debug,
3646 (CE_NOTE,
3647 "nfs4_add_mi_to_server: mi in list"));
3648 in_list = 1;
3653 * First put a hold on the mntinfo4's vfsp so that references via
3654 * mntinfo4_list will be valid.
3656 if (!in_list)
3657 VFS_HOLD(mi->mi_vfsp);
3659 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4_add_mi_to_server: "
3660 "hold vfs %p for mi: %p", (void*)mi->mi_vfsp, (void*)mi));
3662 if (!in_list) {
3663 if (sp->mntinfo4_list)
3664 sp->mntinfo4_list->mi_clientid_prev = mi;
3665 mi->mi_clientid_next = sp->mntinfo4_list;
3666 mi->mi_srv = sp;
3667 sp->mntinfo4_list = mi;
3668 mi->mi_srvsettime = gethrestime_sec();
3669 mi->mi_srvset_cnt++;
3672 /* set mi's clientid to that of sp's for later matching */
3673 mi->mi_clientid = sp->clientid;
3676 * Update the clientid for any other mi's belonging to sp. This
3677 * must be done here while we hold sp->s_lock, so that
3678 * find_nfs4_server() continues to work.
3681 for (tmi = sp->mntinfo4_list;
3682 tmi != NULL;
3683 tmi = tmi->mi_clientid_next) {
3684 if (tmi != mi) {
3685 tmi->mi_clientid = sp->clientid;
3691 * Remove the mi from sp's mntinfo4_list and release its reference.
3692 * Exception: if mi still has open files, flag it for later removal (when
3693 * all the files are closed).
3695 * If this is the last mntinfo4 in sp's list then tell the lease renewal
3696 * thread to exit.
3698 static void
3699 nfs4_remove_mi_from_server_nolock(mntinfo4_t *mi, nfs4_server_t *sp)
3701 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3702 "nfs4_remove_mi_from_server_nolock: remove mi %p from sp %p",
3703 (void*)mi, (void*)sp));
3705 ASSERT(sp != NULL);
3706 ASSERT(MUTEX_HELD(&sp->s_lock));
3707 ASSERT(mi->mi_open_files >= 0);
3710 * First make sure this mntinfo4 can be taken off the list,
3711 * i.e. it doesn't have any open files remaining.
3713 if (mi->mi_open_files > 0) {
3714 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3715 "nfs4_remove_mi_from_server_nolock: don't "
3716 "remove mi since it still has files open"));
3718 mutex_enter(&mi->mi_lock);
3719 mi->mi_flags |= MI4_REMOVE_ON_LAST_CLOSE;
3720 mutex_exit(&mi->mi_lock);
3721 return;
3724 VFS_HOLD(mi->mi_vfsp);
3725 remove_mi(sp, mi);
3726 VFS_RELE(mi->mi_vfsp);
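/*
 * The temporary VFS_HOLD above keeps the vfs alive across
 * remove_mi(), which drops the list's own vfs reference.
 */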
3728 if (sp->mntinfo4_list == NULL) {
3729 /* last fs unmounted, kill the thread */
3730 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3731 "remove_mi_from_nfs4_server_nolock: kill the thread"));
3732 nfs4_mark_srv_dead(sp);
3737 * Remove mi from sp's mntinfo4_list and release the vfs reference.
3739 static void
3740 remove_mi(nfs4_server_t *sp, mntinfo4_t *mi)
3742 ASSERT(MUTEX_HELD(&sp->s_lock));
3745 * We release a reference, and the caller must still have a
3746 * reference.
3748 ASSERT(mi->mi_vfsp->vfs_count >= 2);
3750 if (mi->mi_clientid_prev) {
3751 mi->mi_clientid_prev->mi_clientid_next = mi->mi_clientid_next;
3752 } else {
3753 /* This is the first mi in sp's mntinfo4_list */
3755 * Make sure the first mntinfo4 in the list is the actual
3756 * mntinfo4 passed in.
3758 ASSERT(sp->mntinfo4_list == mi);
3760 sp->mntinfo4_list = mi->mi_clientid_next;
3762 if (mi->mi_clientid_next)
3763 mi->mi_clientid_next->mi_clientid_prev = mi->mi_clientid_prev;
3765 /* Now mark the mntinfo4's links as being removed */
3766 mi->mi_clientid_prev = mi->mi_clientid_next = NULL;
3767 mi->mi_srv = NULL;
3768 mi->mi_srvset_cnt++;
3770 VFS_RELE(mi->mi_vfsp);
3774 * Free all the entries in sp's mntinfo4_list.
3776 static void
3777 remove_all_mi(nfs4_server_t *sp)
3779 mntinfo4_t *mi;
3781 ASSERT(MUTEX_HELD(&sp->s_lock));
3783 while (sp->mntinfo4_list != NULL) {
3784 mi = sp->mntinfo4_list;
3786 * Grab a reference in case there is only one left (which
3787 * remove_mi() frees).
3789 VFS_HOLD(mi->mi_vfsp);
3790 remove_mi(sp, mi);
3791 VFS_RELE(mi->mi_vfsp);
3796 * Remove the mi from sp's mntinfo4_list as above, and rele the vfs.
3798 * This version can be called with a null nfs4_server_t arg,
3799 * and will either find the right one and handle locking, or
3800 * do nothing because the mi wasn't added to an sp's mntinfo4_list.
3802 void
3803 nfs4_remove_mi_from_server(mntinfo4_t *mi, nfs4_server_t *esp)
3805 nfs4_server_t *sp;
3807 if (esp) {
3808 nfs4_remove_mi_from_server_nolock(mi, esp);
3809 return;
3812 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
3813 if (sp = find_nfs4_server_all(mi, 1)) {
3814 nfs4_remove_mi_from_server_nolock(mi, sp);
3815 mutex_exit(&sp->s_lock);
3816 nfs4_server_rele(sp);
3818 nfs_rw_exit(&mi->mi_recovlock);
3822 * Return TRUE if the given server has any non-unmounted filesystems.
3825 bool_t
3826 nfs4_fs_active(nfs4_server_t *sp)
3828 mntinfo4_t *mi;
3830 ASSERT(MUTEX_HELD(&sp->s_lock));
3832 for (mi = sp->mntinfo4_list; mi != NULL; mi = mi->mi_clientid_next) {
3833 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
3834 return (TRUE);
3837 return (FALSE);
3841 * Mark sp as finished and notify any waiters.
3844 void
3845 nfs4_mark_srv_dead(nfs4_server_t *sp)
3847 ASSERT(MUTEX_HELD(&sp->s_lock));
3849 sp->s_thread_exit = NFS4_THREAD_EXIT;
3850 cv_broadcast(&sp->cv_thread_exit);
3854 * Create a new nfs4_server_t structure.
3855 * Returns new node unlocked and not in list, but with a reference count of
3856 * 1.
3858 struct nfs4_server *
3859 new_nfs4_server(struct servinfo4 *svp, cred_t *cr)
3861 struct nfs4_server *np;
3862 timespec_t tt;
3863 union {
3864 struct {
3865 uint32_t sec;
3866 uint32_t subsec;
3867 } un_curtime;
3868 verifier4 un_verifier;
3869 } nfs4clientid_verifier;
3871 * We change this ID string carefully and with the Solaris
3872 * NFS server behaviour in mind. "+referrals" indicates
3873 * a client that can handle an NFSv4 referral.
3875 char id_val[] = "Solaris: %s, NFSv4 kernel client +referrals";
3876 int len;
3878 np = kmem_zalloc(sizeof (struct nfs4_server), KM_SLEEP);
3879 np->saddr.len = svp->sv_addr.len;
3880 np->saddr.maxlen = svp->sv_addr.maxlen;
3881 np->saddr.buf = kmem_alloc(svp->sv_addr.maxlen, KM_SLEEP);
3882 bcopy(svp->sv_addr.buf, np->saddr.buf, svp->sv_addr.len);
3883 np->s_refcnt = 1;
3886 * Build the nfs_client_id4 for this server mount. Ensure
3887 * the verifier is useful and that the identification is
3888 * somehow based on the server's address for the case of
3889 * multi-homed servers.
3891 nfs4clientid_verifier.un_verifier = 0;
3892 gethrestime(&tt);
3893 nfs4clientid_verifier.un_curtime.sec = (uint32_t)tt.tv_sec;
3894 nfs4clientid_verifier.un_curtime.subsec = (uint32_t)tt.tv_nsec;
3895 np->clidtosend.verifier = nfs4clientid_verifier.un_verifier;
3898 * calculate the length of the opaque identifier. Subtract 2
3899 * for the "%s" and add the traditional +1 for null
3900 * termination.
3902 len = strlen(id_val) - 2 + strlen(uts_nodename()) + 1;
3903 np->clidtosend.id_len = len + np->saddr.maxlen;
3905 np->clidtosend.id_val = kmem_alloc(np->clidtosend.id_len, KM_SLEEP);
3906 (void) sprintf(np->clidtosend.id_val, id_val, uts_nodename());
3907 bcopy(np->saddr.buf, &np->clidtosend.id_val[len], np->saddr.len);
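/*
 * The opaque id built above is the formatted nodename string followed
 * by the raw server address, so multi-homed servers get distinct ids.
 */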
3909 np->s_flags = 0;
3910 np->mntinfo4_list = NULL;
3911 /* save cred for issuing rfs4calls inside the renew thread */
3912 crhold(cr);
3913 np->s_cred = cr;
3914 cv_init(&np->cv_thread_exit, NULL, CV_DEFAULT, NULL);
3915 mutex_init(&np->s_lock, NULL, MUTEX_DEFAULT, NULL);
3916 nfs_rw_init(&np->s_recovlock, NULL, RW_DEFAULT, NULL);
3917 list_create(&np->s_deleg_list, sizeof (rnode4_t),
3918 offsetof(rnode4_t, r_deleg_link));
3919 np->s_thread_exit = 0;
3920 np->state_ref_count = 0;
3921 np->lease_valid = NFS4_LEASE_NOT_STARTED;
3922 cv_init(&np->s_cv_otw_count, NULL, CV_DEFAULT, NULL);
3923 cv_init(&np->s_clientid_pend, NULL, CV_DEFAULT, NULL);
3924 np->s_otw_call_count = 0;
3925 cv_init(&np->wait_cb_null, NULL, CV_DEFAULT, NULL);
3926 np->zoneid = getzoneid();
3927 np->zone_globals = nfs4_get_callback_globals();
3928 ASSERT(np->zone_globals != NULL);
3929 return (np);
3933 * Create a new nfs4_server_t structure and add it to the list.
3934 * Returns new node locked; reference must eventually be freed.
3936 static struct nfs4_server *
3937 add_new_nfs4_server(struct servinfo4 *svp, cred_t *cr)
3939 nfs4_server_t *sp;
3941 ASSERT(MUTEX_HELD(&nfs4_server_lst_lock));
3942 sp = new_nfs4_server(svp, cr);
3943 mutex_enter(&sp->s_lock);
3944 insque(sp, &nfs4_server_lst);
3945 sp->s_refcnt++; /* list gets a reference */
3946 sp->s_flags |= N4S_INSERTED;
3947 sp->clientid = 0;
3948 return (sp);
3951 int nfs4_server_t_debug = 0;
3954 #ifdef DEBUG
3955 void
3956 dumpnfs4slist(char *txt, mntinfo4_t *mi, clientid4 clientid, servinfo4_t *srv_p)
3958 int hash16(void *p, int len);
3959 nfs4_server_t *np;
3961 NFS4_DEBUG(nfs4_server_t_debug, (CE_NOTE,
3962 "dumping nfs4_server_t list in %s", txt));
3963 NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
3964 "mi 0x%p, want clientid %llx, addr %d/%04X",
3965 mi, (longlong_t)clientid, srv_p->sv_addr.len,
3966 hash16((void *)srv_p->sv_addr.buf, srv_p->sv_addr.len)));
3967 for (np = nfs4_server_lst.forw; np != &nfs4_server_lst;
3968 np = np->forw) {
3969 NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
3970 "node 0x%p, clientid %llx, addr %d/%04X, cnt %d",
3971 np, (longlong_t)np->clientid, np->saddr.len,
3972 hash16((void *)np->saddr.buf, np->saddr.len),
3973 np->state_ref_count));
3974 if (np->saddr.len == srv_p->sv_addr.len &&
3975 bcmp(np->saddr.buf, srv_p->sv_addr.buf,
3976 np->saddr.len) == 0)
3977 NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
3978 " - address matches"));
3979 if (np->clientid == clientid || np->clientid == 0)
3980 NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
3981 " - clientid matches"));
3982 if (np->s_thread_exit != NFS4_THREAD_EXIT)
3983 NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
3984 " - thread not exiting"));
3986 delay(hz);
3988 #endif
3992 * Move a mntinfo4_t from one server list to another.
3993 * Locking of the two nfs4_server_t nodes will be done in list order.
3995 * Returns NULL if the current nfs4_server_t for the filesystem could not
3996 * be found (e.g., due to forced unmount). Otherwise returns a reference
3997 * to the new nfs4_server_t, which must eventually be freed.
3999 nfs4_server_t *
4000 nfs4_move_mi(mntinfo4_t *mi, servinfo4_t *old, servinfo4_t *new)
4002 nfs4_server_t *p, *op = NULL, *np = NULL;
4003 int num_open;
4004 zoneid_t zoneid = nfs_zoneid();
4006 ASSERT(nfs_zone() == mi->mi_zone);
4008 mutex_enter(&nfs4_server_lst_lock);
4009 #ifdef DEBUG
4010 if (nfs4_server_t_debug)
4011 dumpnfs4slist("nfs4_move_mi", mi, (clientid4)0, new);
4012 #endif
4013 for (p = nfs4_server_lst.forw; p != &nfs4_server_lst; p = p->forw) {
4014 if (p->zoneid != zoneid)
4015 continue;
4016 if (p->saddr.len == old->sv_addr.len &&
4017 bcmp(p->saddr.buf, old->sv_addr.buf, p->saddr.len) == 0 &&
4018 p->s_thread_exit != NFS4_THREAD_EXIT) {
4019 op = p;
4020 mutex_enter(&op->s_lock);
4021 op->s_refcnt++;
4023 if (p->saddr.len == new->sv_addr.len &&
4024 bcmp(p->saddr.buf, new->sv_addr.buf, p->saddr.len) == 0 &&
4025 p->s_thread_exit != NFS4_THREAD_EXIT) {
4026 np = p;
4027 mutex_enter(&np->s_lock);
4029 if (op != NULL && np != NULL)
4030 break;
4032 if (op == NULL) {
4034 * Filesystem has been forcibly unmounted. Bail out.
4036 if (np != NULL)
4037 mutex_exit(&np->s_lock);
4038 mutex_exit(&nfs4_server_lst_lock);
4039 return (NULL);
4041 if (np != NULL) {
4042 np->s_refcnt++;
4043 } else {
4044 #ifdef DEBUG
4045 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
4046 "nfs4_move_mi: no target nfs4_server, will create."));
4047 #endif
4048 np = add_new_nfs4_server(new, kcred);
4050 mutex_exit(&nfs4_server_lst_lock);
4052 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
4053 "nfs4_move_mi: for mi 0x%p, "
4054 "old servinfo4 0x%p, new servinfo4 0x%p, "
4055 "old nfs4_server 0x%p, new nfs4_server 0x%p, ",
4056 (void*)mi, (void*)old, (void*)new,
4057 (void*)op, (void*)np));
4058 ASSERT(op != NULL && np != NULL);
4060 /* discard any delegations */
4061 nfs4_deleg_discard(mi, op);
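/*
 * Transfer this filesystem's open-file count from the old server's
 * state_ref_count to the new server's.
 */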
4063 num_open = mi->mi_open_files;
4064 mi->mi_open_files = 0;
4065 op->state_ref_count -= num_open;
4066 ASSERT(op->state_ref_count >= 0);
4067 np->state_ref_count += num_open;
4068 nfs4_remove_mi_from_server_nolock(mi, op);
4069 mi->mi_open_files = num_open;
4070 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
4071 "nfs4_move_mi: mi_open_files %d, op->cnt %d, np->cnt %d",
4072 mi->mi_open_files, op->state_ref_count, np->state_ref_count));
4074 nfs4_add_mi_to_server(np, mi);
4076 mutex_exit(&op->s_lock);
4077 mutex_exit(&np->s_lock);
4078 nfs4_server_rele(op);
4080 return (np);
4084 * Need to have the nfs4_server_lst_lock.
4085 * Search the nfs4_server list to find a match on this servinfo4
4086 * based on its address.
4088 * Returns NULL if no match is found. Otherwise returns a reference (which
4089 * must eventually be freed) to a locked nfs4_server.
4091 nfs4_server_t *
4092 servinfo4_to_nfs4_server(servinfo4_t *srv_p)
4094 nfs4_server_t *np;
4095 zoneid_t zoneid = nfs_zoneid();
4097 ASSERT(MUTEX_HELD(&nfs4_server_lst_lock));
4098 for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
4099 if (np->zoneid == zoneid &&
4100 np->saddr.len == srv_p->sv_addr.len &&
4101 bcmp(np->saddr.buf, srv_p->sv_addr.buf,
4102 np->saddr.len) == 0 &&
4103 np->s_thread_exit != NFS4_THREAD_EXIT) {
4104 mutex_enter(&np->s_lock);
4105 np->s_refcnt++;
4106 return (np);
4109 return (NULL);

/*
 * Locks the nfs4_server down if it is found and returns a reference that
 * must eventually be freed.
 */
static nfs4_server_t *
lookup_nfs4_server(nfs4_server_t *sp, int any_state)
{
	nfs4_server_t *np;

	mutex_enter(&nfs4_server_lst_lock);
	for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
		mutex_enter(&np->s_lock);
		if (np == sp && np->s_refcnt > 0 &&
		    (np->s_thread_exit != NFS4_THREAD_EXIT || any_state)) {
			mutex_exit(&nfs4_server_lst_lock);
			np->s_refcnt++;
			return (np);
		}
		mutex_exit(&np->s_lock);
	}
	mutex_exit(&nfs4_server_lst_lock);

	return (NULL);
}
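
/*
 * Note on lookup_nfs4_server(): the walk only succeeds if sp is still
 * present on nfs4_server_lst with a non-zero reference count (and, unless
 * any_state is set, has not been flagged for exit by the renew thread).
 * On success sp is returned with sp->s_lock held and an additional
 * reference, which the caller must release with mutex_exit() and
 * nfs4_server_rele().
 */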

/*
 * The caller should be holding mi->mi_recovlock, and it should continue to
 * hold the lock until done with the returned nfs4_server_t. Once
 * mi->mi_recovlock is released, there is no guarantee that the returned
 * nfs4_server_t will continue to correspond to mi.
 */
nfs4_server_t *
find_nfs4_server(mntinfo4_t *mi)
{
	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

	return (lookup_nfs4_server(mi->mi_srv, 0));
}
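
/*
 * Illustrative use (hypothetical caller, following the contract described
 * above and the lock/refcount conventions of lookup_nfs4_server()):
 *
 *	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, FALSE);
 *	sp = find_nfs4_server(mi);
 *	if (sp != NULL) {
 *		... use sp while holding sp->s_lock ...
 *		mutex_exit(&sp->s_lock);
 *		nfs4_server_rele(sp);
 *	}
 *	nfs_rw_exit(&mi->mi_recovlock);
 */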

/*
 * Same as above, but takes an "any_state" parameter which can be
 * set to 1 if the caller wishes to find nfs4_server_t's which
 * have been marked for termination by the exit of the renew
 * thread. This should only be used by operations which are
 * cleaning up and will not cause an OTW op.
 */
nfs4_server_t *
find_nfs4_server_all(mntinfo4_t *mi, int any_state)
{
	ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
	    nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));

	return (lookup_nfs4_server(mi->mi_srv, any_state));
}

/*
 * Lock sp, but only if it's still active (in the list and hasn't been
 * flagged as exiting) or 'any_state' is non-zero.
 * Returns TRUE if sp got locked and adds a reference to sp.
 */
bool_t
nfs4_server_vlock(nfs4_server_t *sp, int any_state)
{
	return (lookup_nfs4_server(sp, any_state) != NULL);
}
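
/*
 * When nfs4_server_vlock() returns TRUE the caller owns both sp->s_lock
 * and an extra reference on sp (both taken in lookup_nfs4_server()), and
 * must drop them with mutex_exit(&sp->s_lock) and nfs4_server_rele(sp).
 */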

/*
 * Release the reference to sp and destroy it if that's the last one.
 */

void
nfs4_server_rele(nfs4_server_t *sp)
{
	mutex_enter(&sp->s_lock);
	ASSERT(sp->s_refcnt > 0);
	sp->s_refcnt--;
	if (sp->s_refcnt > 0) {
		mutex_exit(&sp->s_lock);
		return;
	}
	mutex_exit(&sp->s_lock);

	mutex_enter(&nfs4_server_lst_lock);
	mutex_enter(&sp->s_lock);
	if (sp->s_refcnt > 0) {
		mutex_exit(&sp->s_lock);
		mutex_exit(&nfs4_server_lst_lock);
		return;
	}
	remque(sp);
	sp->forw = sp->back = NULL;
	mutex_exit(&nfs4_server_lst_lock);
	destroy_nfs4_server(sp);
}
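
/*
 * The second refcount check above, made after reacquiring both
 * nfs4_server_lst_lock and sp->s_lock, guards against a racing lookup
 * (e.g. lookup_nfs4_server()) that found sp on the list and bumped
 * s_refcnt between the first mutex_exit() and the list lock acquisition;
 * sp is only unlinked and destroyed once no such reference exists.
 */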

static void
destroy_nfs4_server(nfs4_server_t *sp)
{
	ASSERT(MUTEX_HELD(&sp->s_lock));
	ASSERT(sp->s_refcnt == 0);
	ASSERT(sp->s_otw_call_count == 0);

	remove_all_mi(sp);

	crfree(sp->s_cred);
	kmem_free(sp->saddr.buf, sp->saddr.maxlen);
	kmem_free(sp->clidtosend.id_val, sp->clidtosend.id_len);
	mutex_exit(&sp->s_lock);

	/* destroy the nfs4_server */
	nfs4callback_destroy(sp);
	list_destroy(&sp->s_deleg_list);
	mutex_destroy(&sp->s_lock);
	cv_destroy(&sp->cv_thread_exit);
	cv_destroy(&sp->s_cv_otw_count);
	cv_destroy(&sp->s_clientid_pend);
	cv_destroy(&sp->wait_cb_null);
	nfs_rw_destroy(&sp->s_recovlock);
	kmem_free(sp, sizeof (*sp));
}
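
/*
 * Within this file destroy_nfs4_server() is reached only from
 * nfs4_server_rele(), after the entry has been unlinked from
 * nfs4_server_lst, so no new lookups can find sp; that is why it is safe
 * to drop s_lock midway and tear the structure down without further
 * synchronization.
 */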

/*
 * Fork off a thread to free the data structures for a mount.
 */

static void
async_free_mount(vfs_t *vfsp, int flag, cred_t *cr)
{
	freemountargs_t *args;
	args = kmem_alloc(sizeof (freemountargs_t), KM_SLEEP);
	args->fm_vfsp = vfsp;
	VFS_HOLD(vfsp);
	MI4_HOLD(VFTOMI4(vfsp));
	args->fm_flag = flag;
	args->fm_cr = cr;
	crhold(cr);
	(void) zthread_create(NULL, 0, nfs4_free_mount_thread, args, 0,
	    minclsyspri);
}
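
/*
 * The VFS_HOLD(), MI4_HOLD() and crhold() references taken in
 * async_free_mount() keep the vfs, mntinfo4 and credential alive until
 * the worker thread finishes; the matching VFS_RELE(), MI4_RELE() and
 * crfree() calls are made in nfs4_free_mount_thread() below.
 */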

static void
nfs4_free_mount_thread(freemountargs_t *args)
{
	mntinfo4_t *mi;
	nfs4_free_mount(args->fm_vfsp, args->fm_flag, args->fm_cr);
	mi = VFTOMI4(args->fm_vfsp);
	crfree(args->fm_cr);
	VFS_RELE(args->fm_vfsp);
	MI4_RELE(mi);
	kmem_free(args, sizeof (freemountargs_t));
	zthread_exit();
	/* NOTREACHED */
}

/*
 * Thread to free the data structures for a given filesystem.
 */
static void
nfs4_free_mount(vfs_t *vfsp, int flag, cred_t *cr)
{
	mntinfo4_t *mi = VFTOMI4(vfsp);
	nfs4_server_t *sp;
	callb_cpr_t cpr_info;
	kmutex_t cpr_lock;
	boolean_t async_thread;
	int removed;

	bool_t must_unlock;
	nfs4_ephemeral_tree_t *eph_tree;

	/*
	 * We need to participate in the CPR framework if this is a kernel
	 * thread.
	 */
	async_thread = (curproc == nfs_zone()->zone_zsched);
	if (async_thread) {
		mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
		CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr,
		    "nfsv4AsyncUnmount");
	}

	/*
	 * We need to wait for all outstanding OTW calls
	 * and recovery to finish before we remove the mi
	 * from the nfs4_server_t, as current pending
	 * calls might still need this linkage (in order
	 * to find a nfs4_server_t from a mntinfo4_t).
	 */
	(void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, FALSE);
	sp = find_nfs4_server(mi);
	nfs_rw_exit(&mi->mi_recovlock);

	if (sp) {
		while (sp->s_otw_call_count != 0) {
			if (async_thread) {
				mutex_enter(&cpr_lock);
				CALLB_CPR_SAFE_BEGIN(&cpr_info);
				mutex_exit(&cpr_lock);
			}
			cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
			if (async_thread) {
				mutex_enter(&cpr_lock);
				CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
				mutex_exit(&cpr_lock);
			}
		}
		mutex_exit(&sp->s_lock);
		nfs4_server_rele(sp);
		sp = NULL;
	}

	mutex_enter(&mi->mi_lock);
	while (mi->mi_in_recovery != 0) {
		if (async_thread) {
			mutex_enter(&cpr_lock);
			CALLB_CPR_SAFE_BEGIN(&cpr_info);
			mutex_exit(&cpr_lock);
		}
		cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
		if (async_thread) {
			mutex_enter(&cpr_lock);
			CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
			mutex_exit(&cpr_lock);
		}
	}
	mutex_exit(&mi->mi_lock);

	/*
	 * If we got an error, then do not nuke the
	 * tree. Either the harvester is busy reclaiming
	 * this node or we ran into some busy condition.
	 *
	 * The harvester will eventually come along and cleanup.
	 * The only problem would be the root mount point.
	 *
	 * Since the busy node can occur for a variety
	 * of reasons and can result in an entry staying
	 * in df output but no longer accessible from the
	 * directory tree, we are okay.
	 */
	if (!nfs4_ephemeral_umount(mi, flag, cr,
	    &must_unlock, &eph_tree))
		nfs4_ephemeral_umount_activate(mi, &must_unlock,
		    &eph_tree);

	/*
	 * The original purge of the dnlc via 'dounmount'
	 * doesn't guarantee that another dnlc entry was not
	 * added while we waited for all outstanding OTW
	 * and recovery calls to finish. So re-purge the
	 * dnlc now.
	 */
	(void) dnlc_purge_vfsp(vfsp, 0);

	/*
	 * We need to explicitly stop the manager thread; the async worker
	 * threads can timeout and exit on their own.
	 */
	mutex_enter(&mi->mi_async_lock);
	mi->mi_max_threads = 0;
	NFS4_WAKEALL_ASYNC_WORKERS(mi->mi_async_work_cv);
	mutex_exit(&mi->mi_async_lock);
	if (mi->mi_manager_thread)
		nfs4_async_manager_stop(vfsp);

	destroy_rtable4(vfsp, cr);

	nfs4_remove_mi_from_server(mi, NULL);

	if (async_thread) {
		mutex_enter(&cpr_lock);
		CALLB_CPR_EXIT(&cpr_info);	/* drops cpr_lock */
		mutex_destroy(&cpr_lock);
	}

	removed = nfs4_mi_zonelist_remove(mi);
	if (removed)
		zone_rele_ref(&mi->mi_zone_ref, ZONE_REF_NFSV4);
}
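
/*
 * Teardown order in nfs4_free_mount() matters: outstanding OTW calls and
 * recovery are drained first, then the ephemeral mount tree and the dnlc
 * are cleaned up, the async manager/worker threads are stopped, the rnode
 * table is destroyed, and only then is the mntinfo4 unlinked from its
 * nfs4_server_t and from the zone's mount list.
 */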

/* Referral related sub-routines */

/* Free up the contents of a knetconfig */
static void
free_knconf_contents(struct knetconfig *k)
{
	if (k == NULL)
		return;
	if (k->knc_protofmly)
		kmem_free(k->knc_protofmly, KNC_STRSIZE);
	if (k->knc_proto)
		kmem_free(k->knc_proto, KNC_STRSIZE);
}
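
/*
 * Note: free_knconf_contents() frees only the protocol family and protocol
 * strings; the struct knetconfig itself is left for the caller to free.
 */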

/*
 * This returns the exact name component from the path which gave us an
 * NFS4ERR_MOVED error.
 * If the path is /rp/aaa/bbb and the nth value is 1, aaa is returned.
 */
static char *
extract_referral_point(const char *svp, int nth)
{
	int num_slashes = 0;
	const char *p;
	char *newpath = NULL;
	int i = 0;

	newpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
	for (p = svp; *p; p++) {
		if (*p == '/')
			num_slashes++;
		if (num_slashes == nth + 1) {
			p++;
			while (*p != '/') {
				if (*p == '\0')
					break;
				newpath[i] = *p;
				i++;
				p++;
			}
			newpath[i++] = '\0';
			break;
		}
	}
	return (newpath);
}
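
/*
 * Worked example: extract_referral_point("/rp/aaa/bbb", 1) copies "aaa"
 * into a freshly kmem_zalloc'd MAXPATHLEN buffer and returns it; the
 * caller is responsible for kmem_free(buf, MAXPATHLEN) when finished.
 */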

/*
 * This sets up a new path in sv_path to do a lookup of the referral point.
 * If the path is /rp/aaa/bbb and the referral point is aaa,
 * this updates sv_path to /rp/aaa. This path will be used to get the
 * referral location.
 */
static void
setup_newsvpath(servinfo4_t *svp, int nth)
{
	int num_slashes = 0, pathlen, i = 0;
	char *newpath, *p;

	newpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
	for (p = svp->sv_path; *p; p++) {
		newpath[i] = *p;
		if (*p == '/')
			num_slashes++;
		if (num_slashes == nth + 1) {
			newpath[i] = '\0';
			pathlen = strlen(newpath) + 1;
			kmem_free(svp->sv_path, svp->sv_pathlen);
			svp->sv_path = kmem_alloc(pathlen, KM_SLEEP);
			svp->sv_pathlen = pathlen;
			bcopy(newpath, svp->sv_path, pathlen);
			break;
		}
		i++;
	}
	kmem_free(newpath, MAXPATHLEN);
}
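
/*
 * Mechanically, setup_newsvpath() copies sv_path into a scratch buffer
 * until it reaches the (nth + 1)th '/', truncates the copy there, frees
 * the old sv_path, and installs the shortened string with an updated
 * sv_pathlen; the scratch MAXPATHLEN buffer is always freed on return.
 */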