/* kernel/fs/nfs/nfs_subr.c */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
 */
#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/proc.h>
#include <sys/user.h>
#include <sys/time.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/tiuser.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/session.h>
#include <sys/dnlc.h>
#include <sys/bitmap.h>
#include <sys/acl.h>
#include <sys/ddi.h>
#include <sys/pathname.h>
#include <sys/flock.h>
#include <sys/dirent.h>
#include <sys/callb.h>
#include <sys/atomic.h>
#include <sys/list.h>
#include <sys/priv.h>
#include <sys/sdt.h>
#include <sys/attr.h>

#include <inet/ip6.h>

#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs4.h>
#include <nfs/nfs_clnt.h>
#include <nfs/rnode.h>
#include <nfs/nfs_acl.h>
/*
 * The hash queues for the access to active and cached rnodes
 * are organized as doubly linked lists.  A reader/writer lock
 * for each hash bucket is used to control access and to synchronize
 * lookups, additions, and deletions from the hash queue.
 *
 * The rnode freelist is organized as a doubly linked list with
 * a head pointer.  Additions and deletions are synchronized via
 * a single mutex.
 *
 * In order to add an rnode to the free list, it must be hashed into
 * a hash queue and the exclusive lock to the hash queue must be held.
 * If an rnode is not hashed into a hash queue, then it is destroyed
 * because it represents no valuable information about the file that
 * can be reused.  The exclusive lock to the hash queue must be
 * held in order to prevent a lookup in the hash queue from finding
 * the rnode and using it while assuming that the rnode is not on the
 * freelist.  The lookup in the hash queue will have the hash queue
 * locked, either exclusive or shared.
 *
 * The vnode reference count for each rnode is not allowed to drop
 * below 1.  This prevents external entities, such as the VM
 * subsystem, from acquiring references to vnodes already on the
 * freelist and then trying to place them back on the freelist
 * when their reference is released.  This means that when an
 * rnode is looked up in the hash queues, then either the rnode
 * is removed from the freelist and that reference is transferred to
 * the new reference or the vnode reference count must be incremented
 * accordingly.  The mutex for the freelist must be held in order to
 * accurately test to see if the rnode is on the freelist or not.
 * The hash queue lock might be held shared and it is possible that
 * two different threads may race to remove the rnode from the
 * freelist.  This race can be resolved by holding the mutex for the
 * freelist.  Please note that the mutex for the freelist does not
 * need to be held if the rnode is not on the freelist.  It can not be
 * placed on the freelist due to the requirement that the thread
 * putting the rnode on the freelist must hold the exclusive lock
 * to the hash queue and the thread doing the lookup in the hash
 * queue is holding either a shared or exclusive lock to the hash
 * queue.
 *
 * The lock ordering is:
 *
 *	hash bucket lock -> vnode lock
 *	hash bucket lock -> freelist lock
 */
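
/*
 * Illustrative sketch of the ordering above (a paraphrase, not a verbatim
 * excerpt of the lookup path): a thread that finds an rnode which may be
 * sitting on the freelist takes the hash bucket lock first and only then
 * the freelist mutex, re-checking r_freef under the mutex to resolve the
 * race between two lookups described above.
 *
 *	rw_enter(&hashq->r_lock, RW_READER);		hash bucket lock first
 *	rp = rfind(hashq, fh, vfsp);
 *	if (rp != NULL && rp->r_freef != NULL) {
 *		mutex_enter(&rpfreelist_lock);		then the freelist lock
 *		if (rp->r_freef != NULL)		re-check under the mutex
 *			rp_rmfree(rp);
 *		mutex_exit(&rpfreelist_lock);
 *	}
 *	rw_exit(&hashq->r_lock);
 */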
126 static rhashq_t *rtable;
128 static kmutex_t rpfreelist_lock;
129 static rnode_t *rpfreelist = NULL;
130 static long rnew = 0;
131 long nrnode = 0;
133 static int rtablesize;
134 static int rtablemask;
136 static int hashlen = 4;
138 static struct kmem_cache *rnode_cache;
/*
 * Mutex to protect the following variables:
 *	nfs_major
 *	nfs_minor
 */
kmutex_t nfs_minor_lock;
int nfs_major;
int nfs_minor;
149 /* Do we allow preepoch (negative) time values otw? */
150 bool_t nfs_allow_preepoch_time = FALSE; /* default: do not allow preepoch */
153 * Access cache
155 static acache_hash_t *acache;
156 static long nacache; /* used strictly to size the number of hash queues */
158 static int acachesize;
159 static int acachemask;
160 static struct kmem_cache *acache_cache;
163 * Client side utilities
167 * client side statistics
169 static const struct clstat clstat_tmpl = {
170 { "calls", KSTAT_DATA_UINT64 },
171 { "badcalls", KSTAT_DATA_UINT64 },
172 { "clgets", KSTAT_DATA_UINT64 },
173 { "cltoomany", KSTAT_DATA_UINT64 },
174 #ifdef DEBUG
175 { "clalloc", KSTAT_DATA_UINT64 },
176 { "noresponse", KSTAT_DATA_UINT64 },
177 { "failover", KSTAT_DATA_UINT64 },
178 { "remap", KSTAT_DATA_UINT64 },
179 #endif
/*
 * The following are statistics that describe the behavior of the system as a
 * whole and don't correspond to any one particular zone.
 */
186 #ifdef DEBUG
187 static struct clstat_debug {
188 kstat_named_t nrnode; /* number of allocated rnodes */
189 kstat_named_t access; /* size of access cache */
190 kstat_named_t dirent; /* size of readdir cache */
191 kstat_named_t dirents; /* size of readdir buf cache */
192 kstat_named_t reclaim; /* number of reclaims */
193 kstat_named_t clreclaim; /* number of cl reclaims */
194 kstat_named_t f_reclaim; /* number of free reclaims */
195 kstat_named_t a_reclaim; /* number of active reclaims */
196 kstat_named_t r_reclaim; /* number of rnode reclaims */
197 kstat_named_t rpath; /* bytes used to store rpaths */
198 } clstat_debug = {
199 { "nrnode", KSTAT_DATA_UINT64 },
200 { "access", KSTAT_DATA_UINT64 },
201 { "dirent", KSTAT_DATA_UINT64 },
202 { "dirents", KSTAT_DATA_UINT64 },
203 { "reclaim", KSTAT_DATA_UINT64 },
204 { "clreclaim", KSTAT_DATA_UINT64 },
205 { "f_reclaim", KSTAT_DATA_UINT64 },
206 { "a_reclaim", KSTAT_DATA_UINT64 },
207 { "r_reclaim", KSTAT_DATA_UINT64 },
208 { "r_path", KSTAT_DATA_UINT64 },
210 #endif /* DEBUG */
213 * We keep a global list of per-zone client data, so we can clean up all zones
214 * if we get low on memory.
216 static list_t nfs_clnt_list;
217 static kmutex_t nfs_clnt_list_lock;
218 static zone_key_t nfsclnt_zone_key;
220 static struct kmem_cache *chtab_cache;
223 * Some servers do not properly update the attributes of the
224 * directory when changes are made. To allow interoperability
225 * with these broken servers, the nfs_disable_rddir_cache
226 * parameter must be set in /etc/system
228 int nfs_disable_rddir_cache = 0;
230 int clget(clinfo_t *, servinfo_t *, cred_t *, CLIENT **,
231 struct chtab **);
232 void clfree(CLIENT *, struct chtab *);
233 static int acl_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
234 struct chtab **, struct nfs_clnt *);
235 static int nfs_clget(mntinfo_t *, servinfo_t *, cred_t *, CLIENT **,
236 struct chtab **, struct nfs_clnt *);
237 static void clreclaim(void *);
238 static int nfs_feedback(int, int, mntinfo_t *);
239 static int rfscall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
240 caddr_t, cred_t *, int *, enum clnt_stat *, int,
241 failinfo_t *);
242 static int aclcall(mntinfo_t *, rpcproc_t, xdrproc_t, caddr_t, xdrproc_t,
243 caddr_t, cred_t *, int *, int, failinfo_t *);
244 static void rinactive(rnode_t *, cred_t *);
245 static int rtablehash(nfs_fhandle *);
246 static vnode_t *make_rnode(nfs_fhandle *, rhashq_t *, struct vfs *,
247 const struct vnodeops *,
248 int (*)(vnode_t *, page_t *, uoff_t *, size_t *, int,
249 cred_t *),
250 int (*)(const void *, const void *), int *, cred_t *,
251 char *, char *);
252 static void rp_rmfree(rnode_t *);
253 static void rp_addhash(rnode_t *);
254 static void rp_rmhash_locked(rnode_t *);
255 static rnode_t *rfind(rhashq_t *, nfs_fhandle *, struct vfs *);
256 static void destroy_rnode(rnode_t *);
257 static void rddir_cache_free(rddir_cache *);
258 static int nfs_free_data_reclaim(rnode_t *);
259 static int nfs_active_data_reclaim(rnode_t *);
260 static int nfs_free_reclaim(void);
261 static int nfs_active_reclaim(void);
262 static int nfs_rnode_reclaim(void);
263 static void nfs_reclaim(void *);
264 static int failover_safe(failinfo_t *);
265 static void failover_newserver(mntinfo_t *mi);
266 static void failover_thread(mntinfo_t *mi);
267 static int failover_wait(mntinfo_t *);
268 static int failover_remap(failinfo_t *);
269 static int failover_lookup(char *, vnode_t *,
270 int (*)(vnode_t *, char *, vnode_t **,
271 struct pathname *, int, vnode_t *, cred_t *, int),
272 int (*)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
273 vnode_t **);
274 static void nfs_free_r_path(rnode_t *);
275 static void nfs_set_vroot(vnode_t *);
276 static char *nfs_getsrvnames(mntinfo_t *, size_t *);
279 * from rpcsec module (common/rpcsec)
281 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
282 extern void sec_clnt_freeh(AUTH *);
283 extern void sec_clnt_freeinfo(struct sec_data *);
286 * EIO or EINTR are not recoverable errors.
288 #define IS_RECOVERABLE_ERROR(error) !((error == EINTR) || (error == EIO))
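
/*
 * IS_RECOVERABLE_ERROR() is used by the failover paths below: a remap that
 * fails with EINTR or EIO is given up on, while other errors (e.g. ETIMEDOUT)
 * cause a hard mount to retry against a new server.
 */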
290 #ifdef DEBUG
291 #define SRV_QFULL_MSG "send queue to NFS%d server %s is full; still trying\n"
292 #define SRV_NOTRESP_MSG "NFS%d server %s not responding still trying\n"
293 #else
294 #define SRV_QFULL_MSG "send queue to NFS server %s is full still trying\n"
295 #define SRV_NOTRESP_MSG "NFS server %s not responding still trying\n"
296 #endif
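
/*
 * Note that the DEBUG variants of the messages above take an extra argument:
 * they include the NFS protocol version ("NFS%d"), so the zprintf()/uprintf()
 * call sites below pass mi->mi_vers only in DEBUG kernels.
 */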
298 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
300 static int
301 clget_impl(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
302 struct chtab **chp, struct nfs_clnt *nfscl)
304 struct chhead *ch, *newch;
305 struct chhead **plistp;
306 struct chtab *cp;
307 int error;
308 k_sigset_t smask;
310 if (newcl == NULL || chp == NULL || ci == NULL)
311 return (EINVAL);
313 *newcl = NULL;
314 *chp = NULL;
317 * Find an unused handle or create one
319 newch = NULL;
320 nfscl->nfscl_stat.clgets.value.ui64++;
321 top:
323 * Find the correct entry in the cache to check for free
324 * client handles. The search is based on the RPC program
325 * number, program version number, dev_t for the transport
326 * device, and the protocol family.
328 mutex_enter(&nfscl->nfscl_chtable_lock);
329 plistp = &nfscl->nfscl_chtable;
330 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
331 if (ch->ch_prog == ci->cl_prog &&
332 ch->ch_vers == ci->cl_vers &&
333 ch->ch_dev == svp->sv_knconf->knc_rdev &&
334 (strcmp(ch->ch_protofmly,
335 svp->sv_knconf->knc_protofmly) == 0))
336 break;
337 plistp = &ch->ch_next;
341 * If we didn't find a cache entry for this quadruple, then
342 * create one. If we don't have one already preallocated,
343 * then drop the cache lock, create one, and then start over.
344 * If we did have a preallocated entry, then just add it to
345 * the front of the list.
347 if (ch == NULL) {
348 if (newch == NULL) {
349 mutex_exit(&nfscl->nfscl_chtable_lock);
350 newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
351 newch->ch_timesused = 0;
352 newch->ch_prog = ci->cl_prog;
353 newch->ch_vers = ci->cl_vers;
354 newch->ch_dev = svp->sv_knconf->knc_rdev;
355 newch->ch_protofmly = kmem_alloc(
356 strlen(svp->sv_knconf->knc_protofmly) + 1,
357 KM_SLEEP);
358 (void) strcpy(newch->ch_protofmly,
359 svp->sv_knconf->knc_protofmly);
360 newch->ch_list = NULL;
361 goto top;
363 ch = newch;
364 newch = NULL;
365 ch->ch_next = nfscl->nfscl_chtable;
366 nfscl->nfscl_chtable = ch;
368 * We found a cache entry, but if it isn't on the front of the
369 * list, then move it to the front of the list to try to take
370 * advantage of locality of operations.
372 } else if (ch != nfscl->nfscl_chtable) {
373 *plistp = ch->ch_next;
374 ch->ch_next = nfscl->nfscl_chtable;
375 nfscl->nfscl_chtable = ch;
379 * If there was a free client handle cached, then remove it
380 * from the list, init it, and use it.
382 if (ch->ch_list != NULL) {
383 cp = ch->ch_list;
384 ch->ch_list = cp->ch_list;
385 mutex_exit(&nfscl->nfscl_chtable_lock);
386 if (newch != NULL) {
387 kmem_free(newch->ch_protofmly,
388 strlen(newch->ch_protofmly) + 1);
389 kmem_free(newch, sizeof (*newch));
391 (void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
392 &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
393 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
394 &cp->ch_client->cl_auth);
395 if (error || cp->ch_client->cl_auth == NULL) {
396 CLNT_DESTROY(cp->ch_client);
397 kmem_cache_free(chtab_cache, cp);
398 return ((error != 0) ? error : EINTR);
400 ch->ch_timesused++;
401 *newcl = cp->ch_client;
402 *chp = cp;
403 return (0);
407 * There weren't any free client handles which fit, so allocate
408 * a new one and use that.
410 #ifdef DEBUG
411 atomic_inc_64(&nfscl->nfscl_stat.clalloc.value.ui64);
412 #endif
413 mutex_exit(&nfscl->nfscl_chtable_lock);
415 nfscl->nfscl_stat.cltoomany.value.ui64++;
416 if (newch != NULL) {
417 kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
418 kmem_free(newch, sizeof (*newch));
421 cp = kmem_cache_alloc(chtab_cache, KM_SLEEP);
422 cp->ch_head = ch;
424 sigintr(&smask, (int)ci->cl_flags & MI_INT);
425 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
426 ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
427 sigunintr(&smask);
429 if (error != 0) {
430 kmem_cache_free(chtab_cache, cp);
431 #ifdef DEBUG
432 atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
433 #endif
435 * Warning is unnecessary if error is EINTR.
437 if (error != EINTR) {
438 nfs_cmn_err(error, CE_WARN,
439 "clget: couldn't create handle: %m\n");
441 return (error);
443 (void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
444 auth_destroy(cp->ch_client->cl_auth);
445 error = sec_clnt_geth(cp->ch_client, svp->sv_secdata, cr,
446 &cp->ch_client->cl_auth);
447 if (error || cp->ch_client->cl_auth == NULL) {
448 CLNT_DESTROY(cp->ch_client);
449 kmem_cache_free(chtab_cache, cp);
450 #ifdef DEBUG
451 atomic_dec_64(&nfscl->nfscl_stat.clalloc.value.ui64);
452 #endif
453 return ((error != 0) ? error : EINTR);
455 ch->ch_timesused++;
456 *newcl = cp->ch_client;
457 ASSERT(cp->ch_client->cl_nosignal == FALSE);
458 *chp = cp;
459 return (0);
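
/*
 * Typical check-out/check-in pattern for a client handle (a condensed
 * sketch; rfscall() and aclcall() below follow the same pattern through
 * nfs_clget()/acl_clget() and clfree_impl()):
 *
 *	error = clget(&ci, svp, cr, &client, &ch);
 *	if (error == 0) {
 *		status = CLNT_CALL(client, which, xdrargs, argsp,
 *		    xdrres, resp, wait);
 *		clfree(client, ch);
 *	}
 */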
463 clget(clinfo_t *ci, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
464 struct chtab **chp)
466 struct nfs_clnt *nfscl;
468 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
469 ASSERT(nfscl != NULL);
471 return (clget_impl(ci, svp, cr, newcl, chp, nfscl));
474 static int
475 acl_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
476 struct chtab **chp, struct nfs_clnt *nfscl)
478 clinfo_t ci;
479 int error;
482 * Set read buffer size to rsize
483 * and add room for RPC headers.
485 ci.cl_readsize = mi->mi_tsize;
486 if (ci.cl_readsize != 0)
487 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
/*
 * If this is a soft mount and the server is down, try only once;
 * i.e., do not retransmit.
 */
493 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
494 ci.cl_retrans = 0;
495 else
496 ci.cl_retrans = mi->mi_retrans;
498 ci.cl_prog = NFS_ACL_PROGRAM;
499 ci.cl_vers = mi->mi_vers;
500 ci.cl_flags = mi->mi_flags;
503 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
504 * security flavor, the client tries to establish a security context
505 * by contacting the server. If the connection is timed out or reset,
506 * e.g. server reboot, we will try again.
508 do {
509 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
511 if (error == 0)
512 break;
515 * For forced unmount or zone shutdown, bail out, no retry.
517 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
518 error = EIO;
519 break;
522 /* do not retry for softmount */
523 if (!(mi->mi_flags & MI_HARD))
524 break;
526 /* let the caller deal with the failover case */
527 if (FAILOVER_MOUNT(mi))
528 break;
530 } while (error == ETIMEDOUT || error == ECONNRESET);
532 return (error);
535 static int
536 nfs_clget(mntinfo_t *mi, servinfo_t *svp, cred_t *cr, CLIENT **newcl,
537 struct chtab **chp, struct nfs_clnt *nfscl)
539 clinfo_t ci;
540 int error;
543 * Set read buffer size to rsize
544 * and add room for RPC headers.
546 ci.cl_readsize = mi->mi_tsize;
547 if (ci.cl_readsize != 0)
548 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
/*
 * If this is a soft mount and the server is down, try only once;
 * i.e., do not retransmit.
 */
554 if (!(mi->mi_flags & MI_HARD) && (mi->mi_flags & MI_DOWN))
555 ci.cl_retrans = 0;
556 else
557 ci.cl_retrans = mi->mi_retrans;
559 ci.cl_prog = mi->mi_prog;
560 ci.cl_vers = mi->mi_vers;
561 ci.cl_flags = mi->mi_flags;
564 * clget calls sec_clnt_geth() to get an auth handle. For RPCSEC_GSS
565 * security flavor, the client tries to establish a security context
566 * by contacting the server. If the connection is timed out or reset,
567 * e.g. server reboot, we will try again.
569 do {
570 error = clget_impl(&ci, svp, cr, newcl, chp, nfscl);
572 if (error == 0)
573 break;
576 * For forced unmount or zone shutdown, bail out, no retry.
578 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
579 error = EIO;
580 break;
583 /* do not retry for softmount */
584 if (!(mi->mi_flags & MI_HARD))
585 break;
587 /* let the caller deal with the failover case */
588 if (FAILOVER_MOUNT(mi))
589 break;
591 } while (error == ETIMEDOUT || error == ECONNRESET);
593 return (error);
596 static void
597 clfree_impl(CLIENT *cl, struct chtab *cp, struct nfs_clnt *nfscl)
599 if (cl->cl_auth != NULL) {
600 sec_clnt_freeh(cl->cl_auth);
601 cl->cl_auth = NULL;
605 * Timestamp this cache entry so that we know when it was last
606 * used.
608 cp->ch_freed = gethrestime_sec();
611 * Add the free client handle to the front of the list.
612 * This way, the list will be sorted in youngest to oldest
613 * order.
615 mutex_enter(&nfscl->nfscl_chtable_lock);
616 cp->ch_list = cp->ch_head->ch_list;
617 cp->ch_head->ch_list = cp;
618 mutex_exit(&nfscl->nfscl_chtable_lock);
621 void
622 clfree(CLIENT *cl, struct chtab *cp)
624 struct nfs_clnt *nfscl;
626 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
627 ASSERT(nfscl != NULL);
629 clfree_impl(cl, cp, nfscl);
632 #define CL_HOLDTIME 60 /* time to hold client handles */
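
/*
 * Handles that have sat unused on a per-head freelist for more than
 * CL_HOLDTIME seconds are destroyed by clreclaim_zone() below when the
 * system asks the client to give memory back.
 */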
634 static void
635 clreclaim_zone(struct nfs_clnt *nfscl, uint_t cl_holdtime)
637 struct chhead *ch;
638 struct chtab *cp; /* list of objects that can be reclaimed */
639 struct chtab *cpe;
640 struct chtab *cpl;
641 struct chtab **cpp;
642 #ifdef DEBUG
643 int n = 0;
644 #endif
647 * Need to reclaim some memory, so step through the cache
648 * looking through the lists for entries which can be freed.
650 cp = NULL;
652 mutex_enter(&nfscl->nfscl_chtable_lock);
655 * Here we step through each non-NULL quadruple and start to
656 * construct the reclaim list pointed to by cp. Note that
657 * cp will contain all eligible chtab entries. When this traversal
658 * completes, chtab entries from the last quadruple will be at the
659 * front of cp and entries from previously inspected quadruples have
660 * been appended to the rear of cp.
662 for (ch = nfscl->nfscl_chtable; ch != NULL; ch = ch->ch_next) {
663 if (ch->ch_list == NULL)
664 continue;
/*
 * Search each list for entries older than
 * cl_holdtime seconds.  The lists are maintained
 * in youngest to oldest order so that when the
 * first entry is found which is old enough, then
 * all of the rest of the entries on the list will
 * be old enough as well.
 */
673 cpl = ch->ch_list;
674 cpp = &ch->ch_list;
675 while (cpl != NULL &&
676 cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
677 cpp = &cpl->ch_list;
678 cpl = cpl->ch_list;
680 if (cpl != NULL) {
681 *cpp = NULL;
682 if (cp != NULL) {
683 cpe = cpl;
684 while (cpe->ch_list != NULL)
685 cpe = cpe->ch_list;
686 cpe->ch_list = cp;
688 cp = cpl;
692 mutex_exit(&nfscl->nfscl_chtable_lock);
695 * If cp is empty, then there is nothing to reclaim here.
697 if (cp == NULL)
698 return;
701 * Step through the list of entries to free, destroying each client
702 * handle and kmem_free'ing the memory for each entry.
704 while (cp != NULL) {
705 #ifdef DEBUG
706 n++;
707 #endif
708 CLNT_DESTROY(cp->ch_client);
709 cpl = cp->ch_list;
710 kmem_cache_free(chtab_cache, cp);
711 cp = cpl;
714 #ifdef DEBUG
716 * Update clalloc so that nfsstat shows the current number
717 * of allocated client handles.
719 atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
720 #endif
723 /* ARGSUSED */
724 static void
725 clreclaim(void *all)
727 struct nfs_clnt *nfscl;
729 #ifdef DEBUG
730 clstat_debug.clreclaim.value.ui64++;
731 #endif
733 * The system is low on memory; go through and try to reclaim some from
734 * every zone on the system.
736 mutex_enter(&nfs_clnt_list_lock);
737 nfscl = list_head(&nfs_clnt_list);
738 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl))
739 clreclaim_zone(nfscl, CL_HOLDTIME);
740 mutex_exit(&nfs_clnt_list_lock);
/*
 * Minimum time-out values indexed by call type.
 * These units are in "eighths" of a second to avoid multiplies.
 */
747 static unsigned int minimum_timeo[] = {
748 6, 7, 10
752 * Back off for retransmission timeout, MAXTIMO is in hz of a sec
754 #define MAXTIMO (20*hz)
755 #define backoff(tim) (((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
756 #define dobackoff(tim) ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
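
/*
 * For example, assuming hz = 100 (10ms ticks), a retransmit timeout that
 * starts at 125 ticks backs off as 125 -> 250 -> 500 -> 1000 -> 2000 and
 * is then pinned at MAXTIMO (20 seconds) for later retransmissions.
 */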
758 #define MIN_NFS_TSIZE 512 /* minimum "chunk" of NFS IO */
759 #define REDUCE_NFS_TIME (hz/2) /* rtxcur we try to keep under */
760 #define INCREASE_NFS_TIME (hz/3*8) /* srtt we try to keep under (scaled*8) */
763 * Function called when rfscall notices that we have been
764 * re-transmitting, or when we get a response without retransmissions.
765 * Return 1 if the transfer size was adjusted down - 0 if no change.
767 static int
768 nfs_feedback(int flag, int which, mntinfo_t *mi)
770 int kind;
771 int r = 0;
773 mutex_enter(&mi->mi_lock);
774 if (flag == FEEDBACK_REXMIT1) {
775 if (mi->mi_timers[NFS_CALLTYPES].rt_rtxcur != 0 &&
776 mi->mi_timers[NFS_CALLTYPES].rt_rtxcur < REDUCE_NFS_TIME)
777 goto done;
778 if (mi->mi_curread > MIN_NFS_TSIZE) {
779 mi->mi_curread /= 2;
780 if (mi->mi_curread < MIN_NFS_TSIZE)
781 mi->mi_curread = MIN_NFS_TSIZE;
782 r = 1;
785 if (mi->mi_curwrite > MIN_NFS_TSIZE) {
786 mi->mi_curwrite /= 2;
787 if (mi->mi_curwrite < MIN_NFS_TSIZE)
788 mi->mi_curwrite = MIN_NFS_TSIZE;
789 r = 1;
791 } else if (flag == FEEDBACK_OK) {
792 kind = mi->mi_timer_type[which];
793 if (kind == 0 ||
794 mi->mi_timers[kind].rt_srtt >= INCREASE_NFS_TIME)
795 goto done;
796 if (kind == 1) {
797 if (mi->mi_curread >= mi->mi_tsize)
798 goto done;
799 mi->mi_curread += MIN_NFS_TSIZE;
800 if (mi->mi_curread > mi->mi_tsize/2)
801 mi->mi_curread = mi->mi_tsize;
802 } else if (kind == 2) {
803 if (mi->mi_curwrite >= mi->mi_stsize)
804 goto done;
805 mi->mi_curwrite += MIN_NFS_TSIZE;
806 if (mi->mi_curwrite > mi->mi_stsize/2)
807 mi->mi_curwrite = mi->mi_stsize;
810 done:
811 mutex_exit(&mi->mi_lock);
812 return (r);
815 #ifdef DEBUG
816 static int rfs2call_hits = 0;
817 static int rfs2call_misses = 0;
818 #endif
821 rfs2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
822 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
823 enum nfsstat *statusp, int flags, failinfo_t *fi)
825 int rpcerror;
826 enum clnt_stat rpc_status;
828 ASSERT(statusp != NULL);
830 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
831 cr, douprintf, &rpc_status, flags, fi);
832 if (!rpcerror) {
834 * See crnetadjust() for comments.
836 if (*statusp == NFSERR_ACCES &&
837 (cr = crnetadjust(cr)) != NULL) {
838 #ifdef DEBUG
839 rfs2call_hits++;
840 #endif
841 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres,
842 resp, cr, douprintf, NULL, flags, fi);
843 crfree(cr);
844 #ifdef DEBUG
845 if (*statusp == NFSERR_ACCES)
846 rfs2call_misses++;
847 #endif
849 } else if (rpc_status == RPC_PROCUNAVAIL) {
850 *statusp = NFSERR_OPNOTSUPP;
851 rpcerror = 0;
854 return (rpcerror);
857 #define NFS3_JUKEBOX_DELAY 10 * hz
859 static clock_t nfs3_jukebox_delay = 0;
861 #ifdef DEBUG
862 static int rfs3call_hits = 0;
863 static int rfs3call_misses = 0;
864 #endif
867 rfs3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
868 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
869 nfsstat3 *statusp, int flags, failinfo_t *fi)
871 int rpcerror;
872 int user_informed;
874 user_informed = 0;
875 do {
876 rpcerror = rfscall(mi, which, xdrargs, argsp, xdrres, resp,
877 cr, douprintf, NULL, flags, fi);
878 if (!rpcerror) {
879 cred_t *crr;
880 if (*statusp == NFS3ERR_JUKEBOX) {
881 if (ttoproc(curthread) == &p0) {
882 rpcerror = EAGAIN;
883 break;
885 if (!user_informed) {
886 user_informed = 1;
887 uprintf(
888 "file temporarily unavailable on the server, retrying...\n");
890 delay(nfs3_jukebox_delay);
893 * See crnetadjust() for comments.
895 else if (*statusp == NFS3ERR_ACCES &&
896 (crr = crnetadjust(cr)) != NULL) {
897 #ifdef DEBUG
898 rfs3call_hits++;
899 #endif
900 rpcerror = rfscall(mi, which, xdrargs, argsp,
901 xdrres, resp, crr, douprintf,
902 NULL, flags, fi);
904 crfree(crr);
905 #ifdef DEBUG
906 if (*statusp == NFS3ERR_ACCES)
907 rfs3call_misses++;
908 #endif
911 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
913 return (rpcerror);
#define	VALID_FH(fi)	(VTOR(fi->vp)->r_server == VTOMI(fi->vp)->mi_curr_serv)

#define	INC_READERS(mi)	{ \
	mi->mi_readers++; \
}

#define	DEC_READERS(mi)	{ \
	mi->mi_readers--; \
	if (mi->mi_readers == 0) \
		cv_broadcast(&mi->mi_failover_cv); \
}
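
/*
 * mi_readers counts RPCs that are currently using the mount's notion of the
 * current server.  DEC_READERS broadcasts mi_failover_cv when the count
 * drops to zero so that the failover logic can wait for in-flight requests
 * to drain before switching servers.
 */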
926 static int
927 rfscall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
928 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
929 enum clnt_stat *rpc_status, int flags, failinfo_t *fi)
931 CLIENT *client;
932 struct chtab *ch;
933 cred_t *cr = icr;
934 enum clnt_stat status;
935 struct rpc_err rpcerr, rpcerr_tmp;
936 struct timeval wait;
937 int timeo; /* in units of hz */
938 int my_rsize, my_wsize;
939 bool_t tryagain;
940 bool_t cred_cloned = FALSE;
941 k_sigset_t smask;
942 servinfo_t *svp;
943 struct nfs_clnt *nfscl;
944 zoneid_t zoneid = getzoneid();
945 char *msg;
946 #ifdef DEBUG
947 char *bufp;
948 #endif
951 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
952 "rfscall_start:which %d mi %p", which, mi);
954 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
955 ASSERT(nfscl != NULL);
957 nfscl->nfscl_stat.calls.value.ui64++;
958 mi->mi_reqs[which].value.ui64++;
960 rpcerr.re_status = RPC_SUCCESS;
963 * In case of forced unmount or zone shutdown, return EIO.
966 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
967 rpcerr.re_status = RPC_FAILED;
968 rpcerr.re_errno = EIO;
969 return (rpcerr.re_errno);
973 * Remember the transfer sizes in case
974 * nfs_feedback changes them underneath us.
976 my_rsize = mi->mi_curread;
977 my_wsize = mi->mi_curwrite;
/*
 * NFS client failover support
 *
 * If this rnode is not in sync with the current server (VALID_FH),
 * we'd like to do a remap to get in sync.  We can be interrupted
 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
 * use the best info we have to try the RPC.  Part of that is
 * unconditionally updating the filehandle copy kept for V3.
 *
 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
 * rw_enter(); we're trying to keep the current server from being
 * changed on us until we're done with the remapping and have a
 * matching client handle.  We don't want to send a filehandle
 * to the wrong host.
 */
994 failoverretry:
995 if (FAILOVER_MOUNT(mi)) {
996 mutex_enter(&mi->mi_lock);
997 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
998 if (failover_wait(mi)) {
999 mutex_exit(&mi->mi_lock);
1000 return (EINTR);
1003 INC_READERS(mi);
1004 mutex_exit(&mi->mi_lock);
1005 if (fi) {
1006 if (!VALID_FH(fi) &&
1007 !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1008 int remaperr;
1010 svp = mi->mi_curr_serv;
1011 remaperr = failover_remap(fi);
1012 if (remaperr != 0) {
1013 #ifdef DEBUG
1014 if (remaperr != EINTR)
1015 nfs_cmn_err(remaperr, CE_WARN,
1016 "rfscall couldn't failover: %m");
1017 #endif
1018 mutex_enter(&mi->mi_lock);
1019 DEC_READERS(mi);
1020 mutex_exit(&mi->mi_lock);
1022 * If failover_remap returns ETIMEDOUT
1023 * and the filesystem is hard mounted
1024 * we have to retry the call with a new
1025 * server.
1027 if ((mi->mi_flags & MI_HARD) &&
1028 IS_RECOVERABLE_ERROR(remaperr)) {
1029 if (svp == mi->mi_curr_serv)
1030 failover_newserver(mi);
1031 rpcerr.re_status = RPC_SUCCESS;
1032 goto failoverretry;
1034 rpcerr.re_errno = remaperr;
1035 return (remaperr);
1038 if (fi->fhp && fi->copyproc)
1039 (*fi->copyproc)(fi->fhp, fi->vp);
1044 * clget() calls clnt_tli_kinit() which clears the xid, so we
1045 * are guaranteed to reprocess the retry as a new request.
1047 svp = mi->mi_curr_serv;
1048 rpcerr.re_errno = nfs_clget(mi, svp, cr, &client, &ch, nfscl);
1050 if (FAILOVER_MOUNT(mi)) {
1051 mutex_enter(&mi->mi_lock);
1052 DEC_READERS(mi);
1053 mutex_exit(&mi->mi_lock);
1055 if ((rpcerr.re_errno == ETIMEDOUT ||
1056 rpcerr.re_errno == ECONNRESET) &&
1057 failover_safe(fi)) {
1058 if (svp == mi->mi_curr_serv)
1059 failover_newserver(mi);
1060 goto failoverretry;
1063 if (rpcerr.re_errno != 0)
1064 return (rpcerr.re_errno);
1066 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1067 svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1068 timeo = (mi->mi_timeo * hz) / 10;
1069 } else {
1070 mutex_enter(&mi->mi_lock);
1071 timeo = CLNT_SETTIMERS(client,
1072 &(mi->mi_timers[mi->mi_timer_type[which]]),
1073 &(mi->mi_timers[NFS_CALLTYPES]),
1074 (minimum_timeo[mi->mi_call_type[which]]*hz)>>3,
1075 (void (*)())NULL, (caddr_t)mi, 0);
1076 mutex_exit(&mi->mi_lock);
1080 * If hard mounted fs, retry call forever unless hard error occurs.
1082 do {
1083 tryagain = FALSE;
1085 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1086 status = RPC_FAILED;
1087 rpcerr.re_status = RPC_FAILED;
1088 rpcerr.re_errno = EIO;
1089 break;
1092 TICK_TO_TIMEVAL(timeo, &wait);
1095 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1096 * and SIGTERM. (Preserving the existing masks).
1097 * Mask out SIGINT if mount option nointr is specified.
1099 sigintr(&smask, (int)mi->mi_flags & MI_INT);
1100 if (!(mi->mi_flags & MI_INT))
1101 client->cl_nosignal = TRUE;
1104 * If there is a current signal, then don't bother
1105 * even trying to send out the request because we
1106 * won't be able to block waiting for the response.
1107 * Simply assume RPC_INTR and get on with it.
1109 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1110 status = RPC_INTR;
1111 else {
1112 status = CLNT_CALL(client, which, xdrargs, argsp,
1113 xdrres, resp, wait);
1116 if (!(mi->mi_flags & MI_INT))
1117 client->cl_nosignal = FALSE;
1119 * restore original signal mask
1121 sigunintr(&smask);
1123 switch (status) {
1124 case RPC_SUCCESS:
1125 if ((mi->mi_flags & MI_DYNAMIC) &&
1126 mi->mi_timer_type[which] != 0 &&
1127 (mi->mi_curread != my_rsize ||
1128 mi->mi_curwrite != my_wsize))
1129 (void) nfs_feedback(FEEDBACK_OK, which, mi);
1130 break;
1132 case RPC_INTR:
1134 * There is no way to recover from this error,
1135 * even if mount option nointr is specified.
1136 * SIGKILL, for example, cannot be blocked.
1138 rpcerr.re_status = RPC_INTR;
1139 rpcerr.re_errno = EINTR;
1140 break;
1142 case RPC_UDERROR:
1144 * If the NFS server is local (vold) and
1145 * it goes away then we get RPC_UDERROR.
1146 * This is a retryable error, so we would
1147 * loop, so check to see if the specific
1148 * error was ECONNRESET, indicating that
1149 * target did not exist at all. If so,
1150 * return with RPC_PROGUNAVAIL and
1151 * ECONNRESET to indicate why.
1153 CLNT_GETERR(client, &rpcerr);
1154 if (rpcerr.re_errno == ECONNRESET) {
1155 rpcerr.re_status = RPC_PROGUNAVAIL;
1156 rpcerr.re_errno = ECONNRESET;
1157 break;
1159 /*FALLTHROUGH*/
1161 default: /* probably RPC_TIMEDOUT */
1162 if (IS_UNRECOVERABLE_RPC(status))
1163 break;
1166 * increment server not responding count
1168 mutex_enter(&mi->mi_lock);
1169 mi->mi_noresponse++;
1170 mutex_exit(&mi->mi_lock);
1171 #ifdef DEBUG
1172 nfscl->nfscl_stat.noresponse.value.ui64++;
1173 #endif
1175 if (!(mi->mi_flags & MI_HARD)) {
1176 if (!(mi->mi_flags & MI_SEMISOFT) ||
1177 (mi->mi_ss_call_type[which] == 0))
1178 break;
1182 * The call is in progress (over COTS).
1183 * Try the CLNT_CALL again, but don't
1184 * print a noisy error message.
1186 if (status == RPC_INPROGRESS) {
1187 tryagain = TRUE;
1188 break;
1191 if (flags & RFSCALL_SOFT)
1192 break;
1195 * On zone shutdown, just move on.
1197 if (zone_status_get(curproc->p_zone) >=
1198 ZONE_IS_SHUTTING_DOWN) {
1199 rpcerr.re_status = RPC_FAILED;
1200 rpcerr.re_errno = EIO;
1201 break;
1205 * NFS client failover support
1207 * If the current server just failed us, we'll
1208 * start the process of finding a new server.
1209 * After that, we can just retry.
1211 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1212 if (svp == mi->mi_curr_serv)
1213 failover_newserver(mi);
1214 clfree_impl(client, ch, nfscl);
1215 goto failoverretry;
1218 tryagain = TRUE;
1219 timeo = backoff(timeo);
1221 CLNT_GETERR(client, &rpcerr_tmp);
1222 if ((status == RPC_CANTSEND) &&
1223 (rpcerr_tmp.re_errno == ENOBUFS))
1224 msg = SRV_QFULL_MSG;
1225 else
1226 msg = SRV_NOTRESP_MSG;
1228 mutex_enter(&mi->mi_lock);
1229 if (!(mi->mi_flags & MI_PRINTED)) {
1230 mi->mi_flags |= MI_PRINTED;
1231 mutex_exit(&mi->mi_lock);
1232 #ifdef DEBUG
1233 zprintf(zoneid, msg, mi->mi_vers,
1234 svp->sv_hostname);
1235 #else
1236 zprintf(zoneid, msg, svp->sv_hostname);
1237 #endif
1238 } else
1239 mutex_exit(&mi->mi_lock);
1240 if (*douprintf && nfs_has_ctty()) {
1241 *douprintf = 0;
1242 if (!(mi->mi_flags & MI_NOPRINT))
1243 #ifdef DEBUG
1244 uprintf(msg, mi->mi_vers,
1245 svp->sv_hostname);
1246 #else
1247 uprintf(msg, svp->sv_hostname);
1248 #endif
1252 * If doing dynamic adjustment of transfer
1253 * size and if it's a read or write call
1254 * and if the transfer size changed while
1255 * retransmitting or if the feedback routine
1256 * changed the transfer size,
1257 * then exit rfscall so that the transfer
1258 * size can be adjusted at the vnops level.
1260 if ((mi->mi_flags & MI_DYNAMIC) &&
1261 mi->mi_timer_type[which] != 0 &&
1262 (mi->mi_curread != my_rsize ||
1263 mi->mi_curwrite != my_wsize ||
1264 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1266 * On read or write calls, return
1267 * back to the vnode ops level if
1268 * the transfer size changed.
1270 clfree_impl(client, ch, nfscl);
1271 if (cred_cloned)
1272 crfree(cr);
1273 return (ENFS_TRYAGAIN);
1276 } while (tryagain);
1278 if (status != RPC_SUCCESS) {
1280 * Let soft mounts use the timed out message.
1282 if (status == RPC_INPROGRESS)
1283 status = RPC_TIMEDOUT;
1284 nfscl->nfscl_stat.badcalls.value.ui64++;
1285 if (status != RPC_INTR) {
1286 mutex_enter(&mi->mi_lock);
1287 mi->mi_flags |= MI_DOWN;
1288 mutex_exit(&mi->mi_lock);
1289 CLNT_GETERR(client, &rpcerr);
1290 #ifdef DEBUG
1291 bufp = clnt_sperror(client, svp->sv_hostname);
1292 zprintf(zoneid, "NFS%d %s failed for %s\n",
1293 mi->mi_vers, mi->mi_rfsnames[which], bufp);
1294 if (nfs_has_ctty()) {
1295 if (!(mi->mi_flags & MI_NOPRINT)) {
1296 uprintf("NFS%d %s failed for %s\n",
1297 mi->mi_vers, mi->mi_rfsnames[which],
1298 bufp);
1301 kmem_free(bufp, MAXPATHLEN);
1302 #else
1303 zprintf(zoneid,
1304 "NFS %s failed for server %s: error %d (%s)\n",
1305 mi->mi_rfsnames[which], svp->sv_hostname,
1306 status, clnt_sperrno(status));
1307 if (nfs_has_ctty()) {
1308 if (!(mi->mi_flags & MI_NOPRINT)) {
1309 uprintf(
1310 "NFS %s failed for server %s: error %d (%s)\n",
1311 mi->mi_rfsnames[which],
1312 svp->sv_hostname, status,
1313 clnt_sperrno(status));
1316 #endif
1318 * when CLNT_CALL() fails with RPC_AUTHERROR,
1319 * re_errno is set appropriately depending on
1320 * the authentication error
1322 if (status == RPC_VERSMISMATCH ||
1323 status == RPC_PROGVERSMISMATCH)
1324 rpcerr.re_errno = EIO;
1326 } else {
1328 * Test the value of mi_down and mi_printed without
1329 * holding the mi_lock mutex. If they are both zero,
1330 * then it is okay to skip the down and printed
1331 * processing. This saves on a mutex_enter and
1332 * mutex_exit pair for a normal, successful RPC.
1333 * This was just complete overhead.
1335 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1336 mutex_enter(&mi->mi_lock);
1337 mi->mi_flags &= ~MI_DOWN;
1338 if (mi->mi_flags & MI_PRINTED) {
1339 mi->mi_flags &= ~MI_PRINTED;
1340 mutex_exit(&mi->mi_lock);
1341 #ifdef DEBUG
1342 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1343 zprintf(zoneid, "NFS%d server %s ok\n",
1344 mi->mi_vers, svp->sv_hostname);
1345 #else
1346 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1347 zprintf(zoneid, "NFS server %s ok\n",
1348 svp->sv_hostname);
1349 #endif
1350 } else
1351 mutex_exit(&mi->mi_lock);
1354 if (*douprintf == 0) {
1355 if (!(mi->mi_flags & MI_NOPRINT))
1356 #ifdef DEBUG
1357 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1358 uprintf("NFS%d server %s ok\n",
1359 mi->mi_vers, svp->sv_hostname);
1360 #else
1361 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1362 uprintf("NFS server %s ok\n", svp->sv_hostname);
1363 #endif
1364 *douprintf = 1;
1368 clfree_impl(client, ch, nfscl);
1369 if (cred_cloned)
1370 crfree(cr);
1372 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1374 if (rpc_status != NULL)
1375 *rpc_status = rpcerr.re_status;
1377 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1378 rpcerr.re_errno);
1380 return (rpcerr.re_errno);
1383 #ifdef DEBUG
1384 static int acl2call_hits = 0;
1385 static int acl2call_misses = 0;
1386 #endif
1389 acl2call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1390 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1391 enum nfsstat *statusp, int flags, failinfo_t *fi)
1393 int rpcerror;
1395 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1396 cr, douprintf, flags, fi);
1397 if (!rpcerror) {
1399 * See comments with crnetadjust().
1401 if (*statusp == NFSERR_ACCES &&
1402 (cr = crnetadjust(cr)) != NULL) {
1403 #ifdef DEBUG
1404 acl2call_hits++;
1405 #endif
1406 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres,
1407 resp, cr, douprintf, flags, fi);
1408 crfree(cr);
1409 #ifdef DEBUG
1410 if (*statusp == NFSERR_ACCES)
1411 acl2call_misses++;
1412 #endif
1416 return (rpcerror);
1419 #ifdef DEBUG
1420 static int acl3call_hits = 0;
1421 static int acl3call_misses = 0;
1422 #endif
1425 acl3call(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1426 xdrproc_t xdrres, caddr_t resp, cred_t *cr, int *douprintf,
1427 nfsstat3 *statusp, int flags, failinfo_t *fi)
1429 int rpcerror;
1430 int user_informed;
1432 user_informed = 0;
1434 do {
1435 rpcerror = aclcall(mi, which, xdrargs, argsp, xdrres, resp,
1436 cr, douprintf, flags, fi);
1437 if (!rpcerror) {
1438 cred_t *crr;
1439 if (*statusp == NFS3ERR_JUKEBOX) {
1440 if (!user_informed) {
1441 user_informed = 1;
1442 uprintf(
1443 "file temporarily unavailable on the server, retrying...\n");
1445 delay(nfs3_jukebox_delay);
1448 * See crnetadjust() for comments.
1450 else if (*statusp == NFS3ERR_ACCES &&
1451 (crr = crnetadjust(cr)) != NULL) {
1452 #ifdef DEBUG
1453 acl3call_hits++;
1454 #endif
1455 rpcerror = aclcall(mi, which, xdrargs, argsp,
1456 xdrres, resp, crr, douprintf, flags, fi);
1458 crfree(crr);
1459 #ifdef DEBUG
1460 if (*statusp == NFS3ERR_ACCES)
1461 acl3call_misses++;
1462 #endif
1465 } while (!rpcerror && *statusp == NFS3ERR_JUKEBOX);
1467 return (rpcerror);
1470 static int
1471 aclcall(mntinfo_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1472 xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *douprintf,
1473 int flags, failinfo_t *fi)
1475 CLIENT *client;
1476 struct chtab *ch;
1477 cred_t *cr = icr;
1478 bool_t cred_cloned = FALSE;
1479 enum clnt_stat status;
1480 struct rpc_err rpcerr;
1481 struct timeval wait;
1482 int timeo; /* in units of hz */
1483 #if 0 /* notyet */
1484 int my_rsize, my_wsize;
1485 #endif
1486 bool_t tryagain;
1487 k_sigset_t smask;
1488 servinfo_t *svp;
1489 struct nfs_clnt *nfscl;
1490 zoneid_t zoneid = getzoneid();
1491 #ifdef DEBUG
1492 char *bufp;
1493 #endif
1495 #if 0 /* notyet */
1496 TRACE_2(TR_FAC_NFS, TR_RFSCALL_START,
1497 "rfscall_start:which %d mi %p", which, mi);
1498 #endif
1500 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
1501 ASSERT(nfscl != NULL);
1503 nfscl->nfscl_stat.calls.value.ui64++;
1504 mi->mi_aclreqs[which].value.ui64++;
1506 rpcerr.re_status = RPC_SUCCESS;
1508 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1509 rpcerr.re_status = RPC_FAILED;
1510 rpcerr.re_errno = EIO;
1511 return (rpcerr.re_errno);
1514 #if 0 /* notyet */
1516 * Remember the transfer sizes in case
1517 * nfs_feedback changes them underneath us.
1519 my_rsize = mi->mi_curread;
1520 my_wsize = mi->mi_curwrite;
1521 #endif
/*
 * NFS client failover support
 *
 * If this rnode is not in sync with the current server (VALID_FH),
 * we'd like to do a remap to get in sync.  We can be interrupted
 * in failover_remap(), and if so we'll bail.  Otherwise, we'll
 * use the best info we have to try the RPC.  Part of that is
 * unconditionally updating the filehandle copy kept for V3.
 *
 * Locking: INC_READERS/DEC_READERS is a poor man's interruptible
 * rw_enter(); we're trying to keep the current server from being
 * changed on us until we're done with the remapping and have a
 * matching client handle.  We don't want to send a filehandle
 * to the wrong host.
 */
1538 failoverretry:
1539 if (FAILOVER_MOUNT(mi)) {
1540 mutex_enter(&mi->mi_lock);
1541 if (!(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1542 if (failover_wait(mi)) {
1543 mutex_exit(&mi->mi_lock);
1544 return (EINTR);
1547 INC_READERS(mi);
1548 mutex_exit(&mi->mi_lock);
1549 if (fi) {
1550 if (!VALID_FH(fi) &&
1551 !(flags & RFSCALL_SOFT) && failover_safe(fi)) {
1552 int remaperr;
1554 svp = mi->mi_curr_serv;
1555 remaperr = failover_remap(fi);
1556 if (remaperr != 0) {
1557 #ifdef DEBUG
1558 if (remaperr != EINTR)
1559 nfs_cmn_err(remaperr, CE_WARN,
1560 "aclcall couldn't failover: %m");
1561 #endif
1562 mutex_enter(&mi->mi_lock);
1563 DEC_READERS(mi);
1564 mutex_exit(&mi->mi_lock);
1567 * If failover_remap returns ETIMEDOUT
1568 * and the filesystem is hard mounted
1569 * we have to retry the call with a new
1570 * server.
1572 if ((mi->mi_flags & MI_HARD) &&
1573 IS_RECOVERABLE_ERROR(remaperr)) {
1574 if (svp == mi->mi_curr_serv)
1575 failover_newserver(mi);
1576 rpcerr.re_status = RPC_SUCCESS;
1577 goto failoverretry;
1579 return (remaperr);
1582 if (fi->fhp && fi->copyproc)
1583 (*fi->copyproc)(fi->fhp, fi->vp);
1588 * acl_clget() calls clnt_tli_kinit() which clears the xid, so we
1589 * are guaranteed to reprocess the retry as a new request.
1591 svp = mi->mi_curr_serv;
1592 rpcerr.re_errno = acl_clget(mi, svp, cr, &client, &ch, nfscl);
1593 if (FAILOVER_MOUNT(mi)) {
1594 mutex_enter(&mi->mi_lock);
1595 DEC_READERS(mi);
1596 mutex_exit(&mi->mi_lock);
1598 if ((rpcerr.re_errno == ETIMEDOUT ||
1599 rpcerr.re_errno == ECONNRESET) &&
1600 failover_safe(fi)) {
1601 if (svp == mi->mi_curr_serv)
1602 failover_newserver(mi);
1603 goto failoverretry;
1606 if (rpcerr.re_errno != 0) {
1607 if (cred_cloned)
1608 crfree(cr);
1609 return (rpcerr.re_errno);
1612 if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
1613 svp->sv_knconf->knc_semantics == NC_TPI_COTS) {
1614 timeo = (mi->mi_timeo * hz) / 10;
1615 } else {
1616 mutex_enter(&mi->mi_lock);
1617 timeo = CLNT_SETTIMERS(client,
1618 &(mi->mi_timers[mi->mi_acl_timer_type[which]]),
1619 &(mi->mi_timers[NFS_CALLTYPES]),
1620 (minimum_timeo[mi->mi_acl_call_type[which]]*hz)>>3,
1621 (void (*)()) 0, (caddr_t)mi, 0);
1622 mutex_exit(&mi->mi_lock);
1626 * If hard mounted fs, retry call forever unless hard error occurs.
1628 do {
1629 tryagain = FALSE;
1631 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
1632 status = RPC_FAILED;
1633 rpcerr.re_status = RPC_FAILED;
1634 rpcerr.re_errno = EIO;
1635 break;
1638 TICK_TO_TIMEVAL(timeo, &wait);
1641 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1642 * and SIGTERM. (Preserving the existing masks).
1643 * Mask out SIGINT if mount option nointr is specified.
1645 sigintr(&smask, (int)mi->mi_flags & MI_INT);
1646 if (!(mi->mi_flags & MI_INT))
1647 client->cl_nosignal = TRUE;
1650 * If there is a current signal, then don't bother
1651 * even trying to send out the request because we
1652 * won't be able to block waiting for the response.
1653 * Simply assume RPC_INTR and get on with it.
1655 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1656 status = RPC_INTR;
1657 else {
1658 status = CLNT_CALL(client, which, xdrargs, argsp,
1659 xdrres, resp, wait);
1662 if (!(mi->mi_flags & MI_INT))
1663 client->cl_nosignal = FALSE;
1665 * restore original signal mask
1667 sigunintr(&smask);
1669 switch (status) {
1670 case RPC_SUCCESS:
1671 #if 0 /* notyet */
1672 if ((mi->mi_flags & MI_DYNAMIC) &&
1673 mi->mi_timer_type[which] != 0 &&
1674 (mi->mi_curread != my_rsize ||
1675 mi->mi_curwrite != my_wsize))
1676 (void) nfs_feedback(FEEDBACK_OK, which, mi);
1677 #endif
1678 break;
1681 * Unfortunately, there are servers in the world which
1682 * are not coded correctly. They are not prepared to
1683 * handle RPC requests to the NFS port which are not
1684 * NFS requests. Thus, they may try to process the
1685 * NFS_ACL request as if it were an NFS request. This
1686 * does not work. Generally, an error will be generated
1687 * on the client because it will not be able to decode
1688 * the response from the server. However, it seems
1689 * possible that the server may not be able to decode
1690 * the arguments. Thus, the criteria for deciding
1691 * whether the server supports NFS_ACL or not is whether
1692 * the following RPC errors are returned from CLNT_CALL.
1694 case RPC_CANTDECODERES:
1695 case RPC_PROGUNAVAIL:
1696 case RPC_CANTDECODEARGS:
1697 case RPC_PROGVERSMISMATCH:
1698 mutex_enter(&mi->mi_lock);
1699 mi->mi_flags &= ~(MI_ACL | MI_EXTATTR);
1700 mutex_exit(&mi->mi_lock);
1701 break;
1704 * If the server supports NFS_ACL but not the new ops
1705 * for extended attributes, make sure we don't retry.
1707 case RPC_PROCUNAVAIL:
1708 mutex_enter(&mi->mi_lock);
1709 mi->mi_flags &= ~MI_EXTATTR;
1710 mutex_exit(&mi->mi_lock);
1711 break;
1713 case RPC_INTR:
1715 * There is no way to recover from this error,
1716 * even if mount option nointr is specified.
1717 * SIGKILL, for example, cannot be blocked.
1719 rpcerr.re_status = RPC_INTR;
1720 rpcerr.re_errno = EINTR;
1721 break;
1723 case RPC_UDERROR:
1725 * If the NFS server is local (vold) and
1726 * it goes away then we get RPC_UDERROR.
1727 * This is a retryable error, so we would
1728 * loop, so check to see if the specific
1729 * error was ECONNRESET, indicating that
1730 * target did not exist at all. If so,
1731 * return with RPC_PROGUNAVAIL and
1732 * ECONNRESET to indicate why.
1734 CLNT_GETERR(client, &rpcerr);
1735 if (rpcerr.re_errno == ECONNRESET) {
1736 rpcerr.re_status = RPC_PROGUNAVAIL;
1737 rpcerr.re_errno = ECONNRESET;
1738 break;
1740 /*FALLTHROUGH*/
1742 default: /* probably RPC_TIMEDOUT */
1743 if (IS_UNRECOVERABLE_RPC(status))
1744 break;
1747 * increment server not responding count
1749 mutex_enter(&mi->mi_lock);
1750 mi->mi_noresponse++;
1751 mutex_exit(&mi->mi_lock);
1752 #ifdef DEBUG
1753 nfscl->nfscl_stat.noresponse.value.ui64++;
1754 #endif
1756 if (!(mi->mi_flags & MI_HARD)) {
1757 if (!(mi->mi_flags & MI_SEMISOFT) ||
1758 (mi->mi_acl_ss_call_type[which] == 0))
1759 break;
1763 * The call is in progress (over COTS).
1764 * Try the CLNT_CALL again, but don't
1765 * print a noisy error message.
1767 if (status == RPC_INPROGRESS) {
1768 tryagain = TRUE;
1769 break;
1772 if (flags & RFSCALL_SOFT)
1773 break;
1776 * On zone shutdown, just move on.
1778 if (zone_status_get(curproc->p_zone) >=
1779 ZONE_IS_SHUTTING_DOWN) {
1780 rpcerr.re_status = RPC_FAILED;
1781 rpcerr.re_errno = EIO;
1782 break;
1786 * NFS client failover support
1788 * If the current server just failed us, we'll
1789 * start the process of finding a new server.
1790 * After that, we can just retry.
1792 if (FAILOVER_MOUNT(mi) && failover_safe(fi)) {
1793 if (svp == mi->mi_curr_serv)
1794 failover_newserver(mi);
1795 clfree_impl(client, ch, nfscl);
1796 goto failoverretry;
1799 tryagain = TRUE;
1800 timeo = backoff(timeo);
1801 mutex_enter(&mi->mi_lock);
1802 if (!(mi->mi_flags & MI_PRINTED)) {
1803 mi->mi_flags |= MI_PRINTED;
1804 mutex_exit(&mi->mi_lock);
1805 #ifdef DEBUG
1806 zprintf(zoneid,
1807 "NFS_ACL%d server %s not responding still trying\n",
1808 mi->mi_vers, svp->sv_hostname);
1809 #else
1810 zprintf(zoneid,
1811 "NFS server %s not responding still trying\n",
1812 svp->sv_hostname);
1813 #endif
1814 } else
1815 mutex_exit(&mi->mi_lock);
1816 if (*douprintf && nfs_has_ctty()) {
1817 *douprintf = 0;
1818 if (!(mi->mi_flags & MI_NOPRINT))
1819 #ifdef DEBUG
1820 uprintf(
1821 "NFS_ACL%d server %s not responding still trying\n",
1822 mi->mi_vers, svp->sv_hostname);
1823 #else
1824 uprintf(
1825 "NFS server %s not responding still trying\n",
1826 svp->sv_hostname);
1827 #endif
1830 #if 0 /* notyet */
1832 * If doing dynamic adjustment of transfer
1833 * size and if it's a read or write call
1834 * and if the transfer size changed while
1835 * retransmitting or if the feedback routine
1836 * changed the transfer size,
1837 * then exit rfscall so that the transfer
1838 * size can be adjusted at the vnops level.
1840 if ((mi->mi_flags & MI_DYNAMIC) &&
1841 mi->mi_acl_timer_type[which] != 0 &&
1842 (mi->mi_curread != my_rsize ||
1843 mi->mi_curwrite != my_wsize ||
1844 nfs_feedback(FEEDBACK_REXMIT1, which, mi))) {
1846 * On read or write calls, return
1847 * back to the vnode ops level if
1848 * the transfer size changed.
1850 clfree_impl(client, ch, nfscl);
1851 if (cred_cloned)
1852 crfree(cr);
1853 return (ENFS_TRYAGAIN);
1855 #endif
1857 } while (tryagain);
1859 if (status != RPC_SUCCESS) {
1861 * Let soft mounts use the timed out message.
1863 if (status == RPC_INPROGRESS)
1864 status = RPC_TIMEDOUT;
1865 nfscl->nfscl_stat.badcalls.value.ui64++;
1866 if (status == RPC_CANTDECODERES ||
1867 status == RPC_PROGUNAVAIL ||
1868 status == RPC_PROCUNAVAIL ||
1869 status == RPC_CANTDECODEARGS ||
1870 status == RPC_PROGVERSMISMATCH)
1871 CLNT_GETERR(client, &rpcerr);
1872 else if (status != RPC_INTR) {
1873 mutex_enter(&mi->mi_lock);
1874 mi->mi_flags |= MI_DOWN;
1875 mutex_exit(&mi->mi_lock);
1876 CLNT_GETERR(client, &rpcerr);
1877 #ifdef DEBUG
1878 bufp = clnt_sperror(client, svp->sv_hostname);
1879 zprintf(zoneid, "NFS_ACL%d %s failed for %s\n",
1880 mi->mi_vers, mi->mi_aclnames[which], bufp);
1881 if (nfs_has_ctty()) {
1882 if (!(mi->mi_flags & MI_NOPRINT)) {
1883 uprintf("NFS_ACL%d %s failed for %s\n",
1884 mi->mi_vers, mi->mi_aclnames[which],
1885 bufp);
1888 kmem_free(bufp, MAXPATHLEN);
1889 #else
1890 zprintf(zoneid,
1891 "NFS %s failed for server %s: error %d (%s)\n",
1892 mi->mi_aclnames[which], svp->sv_hostname,
1893 status, clnt_sperrno(status));
1894 if (nfs_has_ctty()) {
1895 if (!(mi->mi_flags & MI_NOPRINT))
1896 uprintf(
1897 "NFS %s failed for server %s: error %d (%s)\n",
1898 mi->mi_aclnames[which],
1899 svp->sv_hostname, status,
1900 clnt_sperrno(status));
1902 #endif
1904 * when CLNT_CALL() fails with RPC_AUTHERROR,
1905 * re_errno is set appropriately depending on
1906 * the authentication error
1908 if (status == RPC_VERSMISMATCH ||
1909 status == RPC_PROGVERSMISMATCH)
1910 rpcerr.re_errno = EIO;
1912 } else {
1914 * Test the value of mi_down and mi_printed without
1915 * holding the mi_lock mutex. If they are both zero,
1916 * then it is okay to skip the down and printed
1917 * processing. This saves on a mutex_enter and
1918 * mutex_exit pair for a normal, successful RPC.
1919 * This was just complete overhead.
1921 if (mi->mi_flags & (MI_DOWN | MI_PRINTED)) {
1922 mutex_enter(&mi->mi_lock);
1923 mi->mi_flags &= ~MI_DOWN;
1924 if (mi->mi_flags & MI_PRINTED) {
1925 mi->mi_flags &= ~MI_PRINTED;
1926 mutex_exit(&mi->mi_lock);
1927 #ifdef DEBUG
1928 zprintf(zoneid, "NFS_ACL%d server %s ok\n",
1929 mi->mi_vers, svp->sv_hostname);
1930 #else
1931 zprintf(zoneid, "NFS server %s ok\n",
1932 svp->sv_hostname);
1933 #endif
1934 } else
1935 mutex_exit(&mi->mi_lock);
1938 if (*douprintf == 0) {
1939 if (!(mi->mi_flags & MI_NOPRINT))
1940 #ifdef DEBUG
1941 uprintf("NFS_ACL%d server %s ok\n",
1942 mi->mi_vers, svp->sv_hostname);
1943 #else
1944 uprintf("NFS server %s ok\n", svp->sv_hostname);
1945 #endif
1946 *douprintf = 1;
1950 clfree_impl(client, ch, nfscl);
1951 if (cred_cloned)
1952 crfree(cr);
1954 ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1956 #if 0 /* notyet */
1957 TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "rfscall_end:errno %d",
1958 rpcerr.re_errno);
1959 #endif
1961 return (rpcerr.re_errno);
1965 vattr_to_sattr(struct vattr *vap, struct nfssattr *sa)
1967 uint_t mask = vap->va_mask;
1969 if (!(mask & VATTR_MODE))
1970 sa->sa_mode = (uint32_t)-1;
1971 else
1972 sa->sa_mode = vap->va_mode;
1973 if (!(mask & VATTR_UID))
1974 sa->sa_uid = (uint32_t)-1;
1975 else
1976 sa->sa_uid = (uint32_t)vap->va_uid;
1977 if (!(mask & VATTR_GID))
1978 sa->sa_gid = (uint32_t)-1;
1979 else
1980 sa->sa_gid = (uint32_t)vap->va_gid;
1981 if (!(mask & VATTR_SIZE))
1982 sa->sa_size = (uint32_t)-1;
1983 else
1984 sa->sa_size = (uint32_t)vap->va_size;
1985 if (!(mask & VATTR_ATIME))
1986 sa->sa_atime.tv_sec = sa->sa_atime.tv_usec = (int32_t)-1;
1987 else {
1988 /* check time validity */
1989 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
1990 return (EOVERFLOW);
1992 sa->sa_atime.tv_sec = vap->va_atime.tv_sec;
1993 sa->sa_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
1995 if (!(mask & VATTR_MTIME))
1996 sa->sa_mtime.tv_sec = sa->sa_mtime.tv_usec = (int32_t)-1;
1997 else {
1998 /* check time validity */
1999 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2000 return (EOVERFLOW);
2002 sa->sa_mtime.tv_sec = vap->va_mtime.tv_sec;
2003 sa->sa_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2005 return (0);
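
/*
 * Example caller (sketch only): an NFSv2 setattr path hands a vattr to
 * vattr_to_sattr() and must be prepared for EOVERFLOW when a timestamp
 * cannot be represented in the 32-bit over-the-wire format.
 *
 *	struct nfssattr sa;
 *	int error;
 *
 *	if ((error = vattr_to_sattr(vap, &sa)) != 0)
 *		return (error);
 */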
2009 vattr_to_sattr3(struct vattr *vap, sattr3 *sa)
2011 uint_t mask = vap->va_mask;
2013 if (!(mask & VATTR_MODE))
2014 sa->mode.set_it = FALSE;
2015 else {
2016 sa->mode.set_it = TRUE;
2017 sa->mode.mode = (mode3)vap->va_mode;
2019 if (!(mask & VATTR_UID))
2020 sa->uid.set_it = FALSE;
2021 else {
2022 sa->uid.set_it = TRUE;
2023 sa->uid.uid = (uid3)vap->va_uid;
2025 if (!(mask & VATTR_GID))
2026 sa->gid.set_it = FALSE;
2027 else {
2028 sa->gid.set_it = TRUE;
2029 sa->gid.gid = (gid3)vap->va_gid;
2031 if (!(mask & VATTR_SIZE))
2032 sa->size.set_it = FALSE;
2033 else {
2034 sa->size.set_it = TRUE;
2035 sa->size.size = (size3)vap->va_size;
2037 if (!(mask & VATTR_ATIME))
2038 sa->atime.set_it = DONT_CHANGE;
2039 else {
2040 /* check time validity */
2041 if (! NFS_TIME_T_OK(vap->va_atime.tv_sec)) {
2042 return (EOVERFLOW);
2044 sa->atime.set_it = SET_TO_CLIENT_TIME;
2045 sa->atime.atime.seconds = (uint32)vap->va_atime.tv_sec;
2046 sa->atime.atime.nseconds = (uint32)vap->va_atime.tv_nsec;
2048 if (!(mask & VATTR_MTIME))
2049 sa->mtime.set_it = DONT_CHANGE;
2050 else {
2051 /* check time validity */
2052 if (! NFS_TIME_T_OK(vap->va_mtime.tv_sec)) {
2053 return (EOVERFLOW);
2055 sa->mtime.set_it = SET_TO_CLIENT_TIME;
2056 sa->mtime.mtime.seconds = (uint32)vap->va_mtime.tv_sec;
2057 sa->mtime.mtime.nseconds = (uint32)vap->va_mtime.tv_nsec;
2059 return (0);
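
/*
 * Unlike the V2 conversion above, which truncates timestamps to microseconds
 * (tv_usec = tv_nsec / 1000), the V3 sattr3 carries the full nanosecond
 * value from the vattr.
 */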
2062 void
2063 setdiropargs(struct nfsdiropargs *da, char *nm, vnode_t *dvp)
2066 da->da_fhandle = VTOFH(dvp);
2067 da->da_name = nm;
2068 da->da_flags = 0;
2071 void
2072 setdiropargs3(diropargs3 *da, char *nm, vnode_t *dvp)
2075 da->dirp = VTOFH3(dvp);
2076 da->name = nm;
2080 setdirgid(vnode_t *dvp, gid_t *gidp, cred_t *cr)
2082 int error;
2083 rnode_t *rp;
2084 struct vattr va;
2086 va.va_mask = VATTR_MODE | VATTR_GID;
2087 error = fop_getattr(dvp, &va, 0, cr, NULL);
2088 if (error)
2089 return (error);
2092 * To determine the expected group-id of the created file:
2093 * 1) If the filesystem was not mounted with the Old-BSD-compatible
2094 * GRPID option, and the directory's set-gid bit is clear,
2095 * then use the process's gid.
2096 * 2) Otherwise, set the group-id to the gid of the parent directory.
2098 rp = VTOR(dvp);
2099 mutex_enter(&rp->r_statelock);
2100 if (!(VTOMI(dvp)->mi_flags & MI_GRPID) && !(va.va_mode & VSGID))
2101 *gidp = crgetgid(cr);
2102 else
2103 *gidp = va.va_gid;
2104 mutex_exit(&rp->r_statelock);
2105 return (0);
2109 setdirmode(vnode_t *dvp, mode_t *omp, cred_t *cr)
2111 int error;
2112 struct vattr va;
2114 va.va_mask = VATTR_MODE;
2115 error = fop_getattr(dvp, &va, 0, cr, NULL);
2116 if (error)
2117 return (error);
2120 * Modify the expected mode (om) so that the set-gid bit matches
2121 * that of the parent directory (dvp).
2123 if (va.va_mode & VSGID)
2124 *omp |= VSGID;
2125 else
2126 *omp &= ~VSGID;
2127 return (0);
2130 void
2131 nfs_setswaplike(vnode_t *vp, vattr_t *vap)
2134 if (vp->v_type == VREG && (vap->va_mode & (VEXEC | VSVTX)) == VSVTX) {
2135 if (!(vp->v_flag & VSWAPLIKE)) {
2136 mutex_enter(&vp->v_lock);
2137 vp->v_flag |= VSWAPLIKE;
2138 mutex_exit(&vp->v_lock);
2140 } else {
2141 if (vp->v_flag & VSWAPLIKE) {
2142 mutex_enter(&vp->v_lock);
2143 vp->v_flag &= ~VSWAPLIKE;
2144 mutex_exit(&vp->v_lock);
2150 * Free the resources associated with an rnode.
2152 static void
2153 rinactive(rnode_t *rp, cred_t *cr)
2155 vnode_t *vp;
2156 cred_t *cred;
2157 char *contents;
2158 int size;
2159 vsecattr_t *vsp;
2160 int error;
2161 nfs3_pathconf_info *info;
2164 * Before freeing anything, wait until all asynchronous
2165 * activity is done on this rnode. This will allow all
2166 * asynchronous read ahead and write behind i/o's to
2167 * finish.
2169 mutex_enter(&rp->r_statelock);
2170 while (rp->r_count > 0)
2171 cv_wait(&rp->r_cv, &rp->r_statelock);
2172 mutex_exit(&rp->r_statelock);
2175 * Flush and invalidate all pages associated with the vnode.
2177 vp = RTOV(rp);
2178 if (vn_has_cached_data(vp)) {
2179 ASSERT(vp->v_type != VCHR);
2180 if ((rp->r_flags & RDIRTY) && !rp->r_error) {
2181 error = fop_putpage(vp, 0, 0, 0, cr, NULL);
2182 if (error && (error == ENOSPC || error == EDQUOT)) {
2183 mutex_enter(&rp->r_statelock);
2184 if (!rp->r_error)
2185 rp->r_error = error;
2186 mutex_exit(&rp->r_statelock);
2189 nfs_invalidate_pages(vp, 0, cr);
2193 * Free any held credentials and caches which may be associated
2194 * with this rnode.
2196 mutex_enter(&rp->r_statelock);
2197 cred = rp->r_cred;
2198 rp->r_cred = NULL;
2199 contents = rp->r_symlink.contents;
2200 size = rp->r_symlink.size;
2201 rp->r_symlink.contents = NULL;
2202 vsp = rp->r_secattr;
2203 rp->r_secattr = NULL;
2204 info = rp->r_pathconf;
2205 rp->r_pathconf = NULL;
2206 mutex_exit(&rp->r_statelock);
2209 * Free the held credential.
2211 if (cred != NULL)
2212 crfree(cred);
2215 * Free the access cache entries.
2217 (void) nfs_access_purge_rp(rp);
2220 * Free the readdir cache entries.
2222 if (HAVE_RDDIR_CACHE(rp))
2223 nfs_purge_rddir_cache(vp);
2226 * Free the symbolic link cache.
2228 if (contents != NULL) {
2230 kmem_free((void *)contents, size);
2234 * Free any cached ACL.
2236 if (vsp != NULL)
2237 nfs_acl_free(vsp);
2240 * Free any cached pathconf information.
2242 if (info != NULL)
2243 kmem_free(info, sizeof (*info));
2247 * Return a vnode for the given NFS Version 2 file handle.
2248 * If no rnode exists for this fhandle, create one and put it
2249 * into the hash queues. If the rnode for this fhandle
2250 * already exists, return it.
2252 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2254 vnode_t *
2255 makenfsnode(fhandle_t *fh, struct nfsfattr *attr, struct vfs *vfsp,
2256 hrtime_t t, cred_t *cr, char *dnm, char *nm)
2258 int newnode;
2259 int index;
2260 vnode_t *vp;
2261 nfs_fhandle nfh;
2262 vattr_t va;
2264 nfh.fh_len = NFS_FHSIZE;
2265 bcopy(fh, nfh.fh_buf, NFS_FHSIZE);
2267 index = rtablehash(&nfh);
2268 rw_enter(&rtable[index].r_lock, RW_READER);
2270 vp = make_rnode(&nfh, &rtable[index], vfsp, &nfs_vnodeops,
2271 nfs_putapage, nfs_rddir_compar, &newnode, cr, dnm, nm);
2273 if (attr != NULL) {
2274 if (!newnode) {
2275 rw_exit(&rtable[index].r_lock);
2276 (void) nfs_cache_fattr(vp, attr, &va, t, cr);
2277 } else {
2278 if (attr->na_type < NFNON || attr->na_type > NFSOC)
2279 vp->v_type = VBAD;
2280 else
2281 vp->v_type = n2v_type(attr);
2283 * A translation here seems to be necessary
2284 * because this function can be called
2285 * with `attr' that has come from the wire,
2286 * and been operated on by vattr_to_nattr().
2287 * See nfsrootvp()->VOP_GETATTR()->nfsgetattr()
2288 * ->nfs_getattr_otw()->rfscall()->vattr_to_nattr()
2289 * ->makenfsnode().
2291 if ((attr->na_rdev & 0xffff0000) == 0)
2292 vp->v_rdev = nfsv2_expdev(attr->na_rdev);
2293 else
2294 vp->v_rdev = expldev(n2v_rdev(attr));
2295 nfs_attrcache(vp, attr, t);
2296 rw_exit(&rtable[index].r_lock);
2298 } else {
2299 if (newnode) {
2300 PURGE_ATTRCACHE(vp);
2302 rw_exit(&rtable[index].r_lock);
2305 return (vp);
2309 * Return a vnode for the given NFS Version 3 file handle.
2310 * If no rnode exists for this fhandle, create one and put it
2311 * into the hash queues. If the rnode for this fhandle
2312 * already exists, return it.
2314 * Note: make_rnode() may upgrade the hash bucket lock to exclusive.
2316 vnode_t *
2317 makenfs3node_va(nfs_fh3 *fh, vattr_t *vap, struct vfs *vfsp, hrtime_t t,
2318 cred_t *cr, char *dnm, char *nm)
2320 int newnode;
2321 int index;
2322 vnode_t *vp;
2324 index = rtablehash((nfs_fhandle *)fh);
2325 rw_enter(&rtable[index].r_lock, RW_READER);
2327 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2328 &nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2329 dnm, nm);
2331 if (vap == NULL) {
2332 if (newnode) {
2333 PURGE_ATTRCACHE(vp);
2335 rw_exit(&rtable[index].r_lock);
2336 return (vp);
2339 if (!newnode) {
2340 rw_exit(&rtable[index].r_lock);
2341 nfs_attr_cache(vp, vap, t, cr);
2342 } else {
2343 rnode_t *rp = VTOR(vp);
2345 vp->v_type = vap->va_type;
2346 vp->v_rdev = vap->va_rdev;
2348 mutex_enter(&rp->r_statelock);
2349 if (rp->r_mtime <= t)
2350 nfs_attrcache_va(vp, vap);
2351 mutex_exit(&rp->r_statelock);
2352 rw_exit(&rtable[index].r_lock);
2355 return (vp);
2358 vnode_t *
2359 makenfs3node(nfs_fh3 *fh, fattr3 *attr, struct vfs *vfsp, hrtime_t t,
2360 cred_t *cr, char *dnm, char *nm)
2362 int newnode;
2363 int index;
2364 vnode_t *vp;
2365 vattr_t va;
2367 index = rtablehash((nfs_fhandle *)fh);
2368 rw_enter(&rtable[index].r_lock, RW_READER);
2370 vp = make_rnode((nfs_fhandle *)fh, &rtable[index], vfsp,
2371 &nfs3_vnodeops, nfs3_putapage, nfs3_rddir_compar, &newnode, cr,
2372 dnm, nm);
2374 if (attr == NULL) {
2375 if (newnode) {
2376 PURGE_ATTRCACHE(vp);
2378 rw_exit(&rtable[index].r_lock);
2379 return (vp);
2382 if (!newnode) {
2383 rw_exit(&rtable[index].r_lock);
2384 (void) nfs3_cache_fattr3(vp, attr, &va, t, cr);
2385 } else {
2386 if (attr->type < NF3REG || attr->type > NF3FIFO)
2387 vp->v_type = VBAD;
2388 else
2389 vp->v_type = nf3_to_vt[attr->type];
2390 vp->v_rdev = makedevice(attr->rdev.specdata1,
2391 attr->rdev.specdata2);
2392 nfs3_attrcache(vp, attr, t);
2393 rw_exit(&rtable[index].r_lock);
2396 return (vp);
2400 * Read this comment before making changes to rtablehash()!
2401 * This is a hash function in which seemingly obvious and harmless
2402 * changes can cause escalations costing millions of dollars!
2403 * Know what you are doing.
2405 * rtablehash() implements Jenkins' one-at-a-time hash algorithm. The
2406 * algorithm is currently detailed here:
2408 * http://burtleburtle.net/bob/hash/doobs.html
2410 * Of course, the above link may not be valid by the time you are reading
2411 * this, but suffice it to say that the one-at-a-time algorithm works well in
2412 * almost all cases. If you are changing the algorithm be sure to verify that
2413 * the hash algorithm still provides even distribution in all cases and with
2414 * any server returning filehandles in whatever order (sequential or random).
2416 static int
2417 rtablehash(nfs_fhandle *fh)
2419 ulong_t hash, len, i;
2420 char *key;
2422 key = fh->fh_buf;
2423 len = (ulong_t)fh->fh_len;
2424 for (hash = 0, i = 0; i < len; i++) {
2425 hash += key[i];
2426 hash += (hash << 10);
2427 hash ^= (hash >> 6);
2429 hash += (hash << 3);
2430 hash ^= (hash >> 11);
2431 hash += (hash << 15);
2432 return (hash & rtablemask);
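/*
 * Find or create the rnode for the given filehandle.  Called with the
 * hash bucket lock held for reading.  If a new rnode is needed (recycled
 * from the freelist once the rnode limit has been reached, or freshly
 * allocated otherwise), the bucket lock is dropped and re-taken as a
 * writer, which is why callers note that make_rnode() may upgrade the
 * lock to exclusive.  On return, *newnode is 1 if a new rnode was hashed
 * in and 0 if an existing one was found.
 */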
2435 static vnode_t *
2436 make_rnode(nfs_fhandle *fh, rhashq_t *rhtp, struct vfs *vfsp,
2437 const struct vnodeops *vops,
2438 int (*putapage)(vnode_t *, page_t *, uoff_t *, size_t *, int, cred_t *),
2439 int (*compar)(const void *, const void *),
2440 int *newnode, cred_t *cr, char *dnm, char *nm)
2442 rnode_t *rp;
2443 rnode_t *trp;
2444 vnode_t *vp;
2445 mntinfo_t *mi;
2447 ASSERT(RW_READ_HELD(&rhtp->r_lock));
2449 mi = VFTOMI(vfsp);
2450 start:
2451 if ((rp = rfind(rhtp, fh, vfsp)) != NULL) {
2452 vp = RTOV(rp);
2453 nfs_set_vroot(vp);
2454 *newnode = 0;
2455 return (vp);
2457 rw_exit(&rhtp->r_lock);
2459 mutex_enter(&rpfreelist_lock);
2460 if (rpfreelist != NULL && rnew >= nrnode) {
2461 rp = rpfreelist;
2462 rp_rmfree(rp);
2463 mutex_exit(&rpfreelist_lock);
2465 vp = RTOV(rp);
2467 if (rp->r_flags & RHASHED) {
2468 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2469 mutex_enter(&vp->v_lock);
2470 if (vp->v_count > 1) {
2471 VN_RELE_LOCKED(vp);
2472 mutex_exit(&vp->v_lock);
2473 rw_exit(&rp->r_hashq->r_lock);
2474 rw_enter(&rhtp->r_lock, RW_READER);
2475 goto start;
2477 mutex_exit(&vp->v_lock);
2478 rp_rmhash_locked(rp);
2479 rw_exit(&rp->r_hashq->r_lock);
2482 rinactive(rp, cr);
2484 mutex_enter(&vp->v_lock);
2485 if (vp->v_count > 1) {
2486 VN_RELE_LOCKED(vp);
2487 mutex_exit(&vp->v_lock);
2488 rw_enter(&rhtp->r_lock, RW_READER);
2489 goto start;
2491 mutex_exit(&vp->v_lock);
2492 vn_invalid(vp);
2494 * destroy old locks before bzero'ing and
2495 * recreating the locks below.
2497 nfs_rw_destroy(&rp->r_rwlock);
2498 nfs_rw_destroy(&rp->r_lkserlock);
2499 mutex_destroy(&rp->r_statelock);
2500 cv_destroy(&rp->r_cv);
2501 cv_destroy(&rp->r_commit.c_cv);
2502 nfs_free_r_path(rp);
2503 avl_destroy(&rp->r_dir);
2505 * Make sure that if rnode is recycled then
2506 * VFS count is decremented properly before
2507 * reuse.
2509 VFS_RELE(vp->v_vfsp);
2510 vn_reinit(vp);
2511 } else {
2512 vnode_t *new_vp;
2514 mutex_exit(&rpfreelist_lock);
2516 rp = kmem_cache_alloc(rnode_cache, KM_SLEEP);
2517 new_vp = vn_alloc(KM_SLEEP);
2519 atomic_inc_ulong((ulong_t *)&rnew);
2520 #ifdef DEBUG
2521 clstat_debug.nrnode.value.ui64++;
2522 #endif
2523 vp = new_vp;
2526 bzero(rp, sizeof (*rp));
2527 rp->r_vnode = vp;
2528 nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
2529 nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
2530 mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
2531 cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
2532 cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
2533 rp->r_fh.fh_len = fh->fh_len;
2534 bcopy(fh->fh_buf, rp->r_fh.fh_buf, fh->fh_len);
2535 rp->r_server = mi->mi_curr_serv;
2536 if (FAILOVER_MOUNT(mi)) {
2538 * If replicated servers, stash pathnames
2540 if (dnm != NULL && nm != NULL) {
2541 char *s, *p;
2542 uint_t len;
2544 len = (uint_t)(strlen(dnm) + strlen(nm) + 2);
2545 rp->r_path = kmem_alloc(len, KM_SLEEP);
2546 #ifdef DEBUG
2547 clstat_debug.rpath.value.ui64 += len;
2548 #endif
2549 s = rp->r_path;
2550 for (p = dnm; *p; p++)
2551 *s++ = *p;
2552 *s++ = '/';
2553 for (p = nm; *p; p++)
2554 *s++ = *p;
2555 *s = '\0';
2556 } else {
2557 /* special case for root */
2558 rp->r_path = kmem_alloc(2, KM_SLEEP);
2559 #ifdef DEBUG
2560 clstat_debug.rpath.value.ui64 += 2;
2561 #endif
2562 *rp->r_path = '.';
2563 *(rp->r_path + 1) = '\0';
2566 VFS_HOLD(vfsp);
2567 rp->r_putapage = putapage;
2568 rp->r_hashq = rhtp;
2569 rp->r_flags = RREADDIRPLUS;
2570 avl_create(&rp->r_dir, compar, sizeof (rddir_cache),
2571 offsetof(rddir_cache, tree));
2572 vn_setops(vp, vops);
2573 vp->v_data = (caddr_t)rp;
2574 vp->v_vfsp = vfsp;
2575 vp->v_type = VNON;
2576 vp->v_flag |= VMODSORT;
2577 nfs_set_vroot(vp);
2580 * There is a race condition if someone else
2581 * alloc's the rnode while no locks are held, so we
2582 * check again and recover if found.
2584 rw_enter(&rhtp->r_lock, RW_WRITER);
2585 if ((trp = rfind(rhtp, fh, vfsp)) != NULL) {
2586 vp = RTOV(trp);
2587 nfs_set_vroot(vp);
2588 *newnode = 0;
2589 rw_exit(&rhtp->r_lock);
2590 rp_addfree(rp, cr);
2591 rw_enter(&rhtp->r_lock, RW_READER);
2592 return (vp);
2594 rp_addhash(rp);
2595 *newnode = 1;
2596 return (vp);
2600 * Callback function to check if the page should be marked as
2601 * modified. In the positive case, p_fsdata is set to C_NOCOMMIT.
2604 nfs_setmod_check(page_t *pp)
2606 if (pp->p_fsdata != C_NOCOMMIT) {
2607 pp->p_fsdata = C_NOCOMMIT;
2608 return (1);
2610 return (0);
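/*
 * If this rnode's filehandle matches the server's root filehandle, mark
 * the vnode VROOT so it is treated as the root of the mounted filesystem.
 */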
2613 static void
2614 nfs_set_vroot(vnode_t *vp)
2616 rnode_t *rp;
2617 nfs_fhandle *rootfh;
2619 rp = VTOR(vp);
2620 rootfh = &rp->r_server->sv_fhandle;
2621 if (rootfh->fh_len == rp->r_fh.fh_len &&
2622 bcmp(rootfh->fh_buf, rp->r_fh.fh_buf, rp->r_fh.fh_len) == 0) {
2623 if (!(vp->v_flag & VROOT)) {
2624 mutex_enter(&vp->v_lock);
2625 vp->v_flag |= VROOT;
2626 mutex_exit(&vp->v_lock);
2631 static void
2632 nfs_free_r_path(rnode_t *rp)
2634 char *path;
2635 size_t len;
2637 path = rp->r_path;
2638 if (path) {
2639 rp->r_path = NULL;
2640 len = strlen(path) + 1;
2641 kmem_free(path, len);
2642 #ifdef DEBUG
2643 clstat_debug.rpath.value.ui64 -= len;
2644 #endif
2649 * Put an rnode on the free list.
2651 * Rnodes which were allocated above and beyond the normal limit
2652 * are immediately freed.
2654 void
2655 rp_addfree(rnode_t *rp, cred_t *cr)
2657 vnode_t *vp;
2658 struct vfs *vfsp;
2660 vp = RTOV(rp);
2661 ASSERT(vp->v_count >= 1);
2662 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
2665 * If we have too many rnodes allocated and there are no
2666 * references to this rnode, or if the rnode is no longer
2667 * accessible because it does not reside in the hash queues,
2668 * or if an i/o error occurred while writing to the file,
2669 * then just free it instead of putting it on the rnode
2670 * freelist.
2672 vfsp = vp->v_vfsp;
2673 if (((rnew > nrnode || !(rp->r_flags & RHASHED) || rp->r_error ||
2674 (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
2675 if (rp->r_flags & RHASHED) {
2676 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2677 mutex_enter(&vp->v_lock);
2678 if (vp->v_count > 1) {
2679 VN_RELE_LOCKED(vp);
2680 mutex_exit(&vp->v_lock);
2681 rw_exit(&rp->r_hashq->r_lock);
2682 return;
2684 mutex_exit(&vp->v_lock);
2685 rp_rmhash_locked(rp);
2686 rw_exit(&rp->r_hashq->r_lock);
2689 rinactive(rp, cr);
2692 * Recheck the vnode reference count. We need to
2693 * make sure that another reference has not been
2694 * acquired while we were not holding v_lock. The
2695 * rnode is not in the rnode hash queues, so the
2696 * only way for a reference to have been acquired
2697 * is for a fop_putpage because the rnode was marked
2698 * with RDIRTY or for a modified page. This
2699 * reference may have been acquired before our call
2700 * to rinactive. The i/o may have been completed,
2701 * thus allowing rinactive to complete, but the
2702 * reference to the vnode may not have been released
2703 * yet. In any case, the rnode can not be destroyed
2704 * until the other references to this vnode have been
2705 * released. The other references will take care of
2706 * either destroying the rnode or placing it on the
2707 * rnode freelist. If there are no other references,
2708 * then the rnode may be safely destroyed.
2710 mutex_enter(&vp->v_lock);
2711 if (vp->v_count > 1) {
2712 VN_RELE_LOCKED(vp);
2713 mutex_exit(&vp->v_lock);
2714 return;
2716 mutex_exit(&vp->v_lock);
2718 destroy_rnode(rp);
2719 return;
2723 * Lock the hash queue and then recheck the reference count
2724 * to ensure that no other threads have acquired a reference
2725 * to indicate that the rnode should not be placed on the
2726 * freelist. If another reference has been acquired, then
2727 * just release this one and let the other thread complete
2728 * the processing of adding this rnode to the freelist.
2730 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2732 mutex_enter(&vp->v_lock);
2733 if (vp->v_count > 1) {
2734 VN_RELE_LOCKED(vp);
2735 mutex_exit(&vp->v_lock);
2736 rw_exit(&rp->r_hashq->r_lock);
2737 return;
2739 mutex_exit(&vp->v_lock);
2742 * If there is no cached data or metadata for this file, then
2743 * put the rnode on the front of the freelist so that it will
2744 * be reused before other rnodes which may have cached data or
2745 * metadata associated with them.
2747 mutex_enter(&rpfreelist_lock);
2748 if (rpfreelist == NULL) {
2749 rp->r_freef = rp;
2750 rp->r_freeb = rp;
2751 rpfreelist = rp;
2752 } else {
2753 rp->r_freef = rpfreelist;
2754 rp->r_freeb = rpfreelist->r_freeb;
2755 rpfreelist->r_freeb->r_freef = rp;
2756 rpfreelist->r_freeb = rp;
2757 if (!vn_has_cached_data(vp) &&
2758 !HAVE_RDDIR_CACHE(rp) &&
2759 rp->r_symlink.contents == NULL &&
2760 rp->r_secattr == NULL &&
2761 rp->r_pathconf == NULL)
2762 rpfreelist = rp;
2764 mutex_exit(&rpfreelist_lock);
2766 rw_exit(&rp->r_hashq->r_lock);
2770 * Remove an rnode from the free list.
2772 * The caller must be holding rpfreelist_lock and the rnode
2773 * must be on the freelist.
2775 static void
2776 rp_rmfree(rnode_t *rp)
2779 ASSERT(MUTEX_HELD(&rpfreelist_lock));
2780 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
2782 if (rp == rpfreelist) {
2783 rpfreelist = rp->r_freef;
2784 if (rp == rpfreelist)
2785 rpfreelist = NULL;
2788 rp->r_freeb->r_freef = rp->r_freef;
2789 rp->r_freef->r_freeb = rp->r_freeb;
2791 rp->r_freef = rp->r_freeb = NULL;
2795 * Put an rnode in the hash table.
2797 * The caller must be holding the exclusive hash queue lock.
2799 static void
2800 rp_addhash(rnode_t *rp)
2802 mntinfo_t *mi;
2804 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2805 ASSERT(!(rp->r_flags & RHASHED));
2807 rp->r_hashf = rp->r_hashq->r_hashf;
2808 rp->r_hashq->r_hashf = rp;
2809 rp->r_hashb = (rnode_t *)rp->r_hashq;
2810 rp->r_hashf->r_hashb = rp;
2812 mutex_enter(&rp->r_statelock);
2813 rp->r_flags |= RHASHED;
2814 mutex_exit(&rp->r_statelock);
2816 mi = VTOMI(RTOV(rp));
2817 mutex_enter(&mi->mi_rnodes_lock);
2818 list_insert_tail(&mi->mi_rnodes, rp);
2819 mutex_exit(&mi->mi_rnodes_lock);
2823 * Remove an rnode from the hash table.
2825 * The caller must be holding the hash queue lock.
2827 static void
2828 rp_rmhash_locked(rnode_t *rp)
2830 mntinfo_t *mi;
2832 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
2833 ASSERT(rp->r_flags & RHASHED);
2835 rp->r_hashb->r_hashf = rp->r_hashf;
2836 rp->r_hashf->r_hashb = rp->r_hashb;
2838 mutex_enter(&rp->r_statelock);
2839 rp->r_flags &= ~RHASHED;
2840 mutex_exit(&rp->r_statelock);
2842 mi = VTOMI(RTOV(rp));
2843 mutex_enter(&mi->mi_rnodes_lock);
2844 if (list_link_active(&rp->r_mi_link))
2845 list_remove(&mi->mi_rnodes, rp);
2846 mutex_exit(&mi->mi_rnodes_lock);
2850 * Remove an rnode from the hash table.
2852 * The caller must not be holding the hash queue lock.
2854 void
2855 rp_rmhash(rnode_t *rp)
2858 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
2859 rp_rmhash_locked(rp);
2860 rw_exit(&rp->r_hashq->r_lock);
2864 * Look up an rnode by fhandle.
2866 * The caller must be holding the hash queue lock, either shared or exclusive.
2868 static rnode_t *
2869 rfind(rhashq_t *rhtp, nfs_fhandle *fh, struct vfs *vfsp)
2871 rnode_t *rp;
2872 vnode_t *vp;
2874 ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
2876 for (rp = rhtp->r_hashf; rp != (rnode_t *)rhtp; rp = rp->r_hashf) {
2877 vp = RTOV(rp);
2878 if (vp->v_vfsp == vfsp &&
2879 rp->r_fh.fh_len == fh->fh_len &&
2880 bcmp(rp->r_fh.fh_buf, fh->fh_buf, fh->fh_len) == 0) {
2882 * remove rnode from free list, if necessary.
2884 if (rp->r_freef != NULL) {
2885 mutex_enter(&rpfreelist_lock);
2887 * If the rnode is on the freelist,
2888 * then remove it and use that reference
2889 * as the new reference. Otherwise,
2890 * need to increment the reference count.
2892 if (rp->r_freef != NULL) {
2893 rp_rmfree(rp);
2894 mutex_exit(&rpfreelist_lock);
2895 } else {
2896 mutex_exit(&rpfreelist_lock);
2897 VN_HOLD(vp);
2899 } else
2900 VN_HOLD(vp);
2901 return (rp);
2904 return (NULL);
2908 * Return 1 if there is an active vnode belonging to this vfs in the
2909 * rtable cache.
2911 * Several of these checks are done without holding the usual
2912 * locks. This is safe because destroy_rtable(), rp_addfree(),
2913 * etc. will redo the necessary checks before actually destroying
2914 * any rnodes.
2917 check_rtable(struct vfs *vfsp)
2919 rnode_t *rp;
2920 vnode_t *vp;
2921 mntinfo_t *mi;
2923 ASSERT(vfsp != NULL);
2924 mi = VFTOMI(vfsp);
2926 mutex_enter(&mi->mi_rnodes_lock);
2927 for (rp = list_head(&mi->mi_rnodes); rp != NULL;
2928 rp = list_next(&mi->mi_rnodes, rp)) {
2929 vp = RTOV(rp);
2931 if (rp->r_freef == NULL ||
2932 (vn_has_cached_data(vp) && (rp->r_flags & RDIRTY)) ||
2933 rp->r_count > 0) {
2934 mutex_exit(&mi->mi_rnodes_lock);
2935 return (1);
2938 mutex_exit(&mi->mi_rnodes_lock);
2940 return (0);
2944 * Destroy inactive vnodes from the hash queues which belong to this
2945 * vfs. It is essential that we destroy all inactive vnodes during a
2946 * forced unmount as well as during a normal unmount.
2948 void
2949 destroy_rtable(struct vfs *vfsp, cred_t *cr)
2951 rnode_t *rp;
2952 mntinfo_t *mi;
2954 ASSERT(vfsp != NULL);
2956 mi = VFTOMI(vfsp);
2958 mutex_enter(&rpfreelist_lock);
2959 mutex_enter(&mi->mi_rnodes_lock);
2960 while ((rp = list_remove_head(&mi->mi_rnodes)) != NULL) {
2962 * If the rnode is no longer on the freelist it is not
2963 * ours and it will be handled by some other thread, so
2964 * skip it.
2966 if (rp->r_freef == NULL)
2967 continue;
2968 mutex_exit(&mi->mi_rnodes_lock);
2970 rp_rmfree(rp);
2971 mutex_exit(&rpfreelist_lock);
2973 rp_rmhash(rp);
2976 * This call to rp_addfree will end up destroying the
2977 * rnode, but in a safe way with the appropriate set
2978 * of checks done.
2980 rp_addfree(rp, cr);
2982 mutex_enter(&rpfreelist_lock);
2983 mutex_enter(&mi->mi_rnodes_lock);
2985 mutex_exit(&mi->mi_rnodes_lock);
2986 mutex_exit(&rpfreelist_lock);
2990 * This routine destroys all the resources associated with the rnode
2991 * and then the rnode itself.
2993 static void
2994 destroy_rnode(rnode_t *rp)
2996 vnode_t *vp;
2997 vfs_t *vfsp;
2999 vp = RTOV(rp);
3000 vfsp = vp->v_vfsp;
3002 ASSERT(vp->v_count == 1);
3003 ASSERT(rp->r_count == 0);
3004 ASSERT(rp->r_lmpl == NULL);
3005 ASSERT(rp->r_mapcnt == 0);
3006 ASSERT(!(rp->r_flags & RHASHED));
3007 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
3008 atomic_dec_ulong((ulong_t *)&rnew);
3009 #ifdef DEBUG
3010 clstat_debug.nrnode.value.ui64--;
3011 #endif
3012 nfs_rw_destroy(&rp->r_rwlock);
3013 nfs_rw_destroy(&rp->r_lkserlock);
3014 mutex_destroy(&rp->r_statelock);
3015 cv_destroy(&rp->r_cv);
3016 cv_destroy(&rp->r_commit.c_cv);
3017 if (rp->r_flags & RDELMAPLIST)
3018 list_destroy(&rp->r_indelmap);
3019 nfs_free_r_path(rp);
3020 avl_destroy(&rp->r_dir);
3021 vn_invalid(vp);
3022 vn_free(vp);
3023 kmem_cache_free(rnode_cache, rp);
3024 VFS_RELE(vfsp);
3028 * Flush all vnodes in this (or every) vfs.
3029 * Used by nfs_sync and by nfs_unmount.
3031 void
3032 rflush(struct vfs *vfsp, cred_t *cr)
3034 int index;
3035 rnode_t *rp;
3036 vnode_t *vp, **vplist;
3037 long num, cnt;
3040 * Check to see whether there is anything to do.
3042 num = rnew;
3043 if (num == 0)
3044 return;
3047 * Allocate a slot for all currently active rnodes on the
3048 * supposition that they all may need flushing.
3050 vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
3051 cnt = 0;
3054 * If the vfs is known we can take a fast path by iterating over all
3055 * rnodes that belong to this vfs. This is much faster than the
3056 * traditional way of iterating rtable (below) when there are a lot of
3057 * rnodes that do not belong to our vfs.
3059 if (vfsp != NULL) {
3060 mntinfo_t *mi = VFTOMI(vfsp);
3062 mutex_enter(&mi->mi_rnodes_lock);
3063 for (rp = list_head(&mi->mi_rnodes); rp != NULL;
3064 rp = list_next(&mi->mi_rnodes, rp)) {
3065 vp = RTOV(rp);
3067 * Don't bother sync'ing a vp if it
3068 * is part of virtual swap device or
3069 * if VFS is read-only
3071 if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3072 continue;
3074 * If the vnode has pages and is marked as either dirty
3075 * or mmap'd, hold and add this vnode to the list of
3076 * vnodes to flush.
3078 ASSERT(vp->v_vfsp == vfsp);
3079 if (vn_has_cached_data(vp) &&
3080 ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3081 VN_HOLD(vp);
3082 vplist[cnt++] = vp;
3083 if (cnt == num) {
3085 * The vplist is full because there are
3086 * too many rnodes. We are done for
3087 * now.
3089 break;
3093 mutex_exit(&mi->mi_rnodes_lock);
3095 goto done;
3098 ASSERT(vfsp == NULL);
3101 * Walk the hash queues looking for rnodes with page
3102 * lists associated with them. Make a list of these
3103 * files.
3105 for (index = 0; index < rtablesize; index++) {
3106 rw_enter(&rtable[index].r_lock, RW_READER);
3107 for (rp = rtable[index].r_hashf;
3108 rp != (rnode_t *)(&rtable[index]);
3109 rp = rp->r_hashf) {
3110 vp = RTOV(rp);
3112 * Don't bother sync'ing a vp if it
3113 * is part of virtual swap device or
3114 * if VFS is read-only
3116 if (IS_SWAPVP(vp) || vn_is_readonly(vp))
3117 continue;
3119 * If the vnode has pages and is marked as either dirty
3120 * or mmap'd, hold and add this vnode to the list of
3121 * vnodes to flush.
3123 if (vn_has_cached_data(vp) &&
3124 ((rp->r_flags & RDIRTY) || rp->r_mapcnt > 0)) {
3125 VN_HOLD(vp);
3126 vplist[cnt++] = vp;
3127 if (cnt == num) {
3128 rw_exit(&rtable[index].r_lock);
3130 * The vplist is full because there are
3131 * too many rnodes. We are done for
3132 * now.
3134 goto done;
3138 rw_exit(&rtable[index].r_lock);
3141 done:
3144 * Flush and release all of the files on the list.
3146 while (cnt-- > 0) {
3147 vp = vplist[cnt];
3148 (void) fop_putpage(vp, 0, 0, B_ASYNC, cr, NULL);
3149 VN_RELE(vp);
3153 * Free the space allocated to hold the list.
3155 kmem_free(vplist, num * sizeof (*vplist));
3159 * This probably needs to be larger than or equal to
3160 * log2(sizeof (struct rnode)) due to the way that rnodes are
3161 * allocated.
3163 #define ACACHE_SHIFT_BITS 9
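/*
 * Hash an (rnode, credential) pair into an access cache bucket: shift the
 * rnode address right to drop the low-order bits (largely fixed by the
 * allocator) and add the caller's uid, so entries for the same file under
 * different credentials tend to land in different buckets.
 */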
3165 static int
3166 acachehash(rnode_t *rp, cred_t *cr)
3169 return ((((intptr_t)rp >> ACACHE_SHIFT_BITS) + crgetuid(cr)) &
3170 acachemask);
3173 #ifdef DEBUG
3174 static long nfs_access_cache_hits = 0;
3175 static long nfs_access_cache_misses = 0;
3176 #endif
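/*
 * Consult the access cache for this rnode and credential.  If the
 * attribute cache is invalid (or a purge is in progress), or if the
 * requested access bits have not all been cached, return
 * NFS_ACCESS_UNKNOWN so that the caller goes over the wire; otherwise
 * return the cached answer as NFS_ACCESS_ALLOWED or NFS_ACCESS_DENIED.
 */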
3178 nfs_access_type_t
3179 nfs_access_check(rnode_t *rp, uint32_t acc, cred_t *cr)
3181 vnode_t *vp;
3182 acache_t *ap;
3183 acache_hash_t *hp;
3184 nfs_access_type_t all;
3186 vp = RTOV(rp);
3187 if (!ATTRCACHE_VALID(vp) || nfs_waitfor_purge_complete(vp))
3188 return (NFS_ACCESS_UNKNOWN);
3190 if (rp->r_acache != NULL) {
3191 hp = &acache[acachehash(rp, cr)];
3192 rw_enter(&hp->lock, RW_READER);
3193 ap = hp->next;
3194 while (ap != (acache_t *)hp) {
3195 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3196 if ((ap->known & acc) == acc) {
3197 #ifdef DEBUG
3198 nfs_access_cache_hits++;
3199 #endif
3200 if ((ap->allowed & acc) == acc)
3201 all = NFS_ACCESS_ALLOWED;
3202 else
3203 all = NFS_ACCESS_DENIED;
3204 } else {
3205 #ifdef DEBUG
3206 nfs_access_cache_misses++;
3207 #endif
3208 all = NFS_ACCESS_UNKNOWN;
3210 rw_exit(&hp->lock);
3211 return (all);
3213 ap = ap->next;
3215 rw_exit(&hp->lock);
3218 #ifdef DEBUG
3219 nfs_access_cache_misses++;
3220 #endif
3221 return (NFS_ACCESS_UNKNOWN);
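/*
 * Record the result of an over-the-wire access check: `acc' is the set of
 * bits that was checked and `resacc' the set the server granted.  A new
 * entry is preallocated before the bucket lock is taken; if an entry for
 * this (rnode, credential) pair already exists it is updated in place and
 * the preallocated entry is discarded.
 */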
3224 void
3225 nfs_access_cache(rnode_t *rp, uint32_t acc, uint32_t resacc, cred_t *cr)
3227 acache_t *ap;
3228 acache_t *nap;
3229 acache_hash_t *hp;
3231 hp = &acache[acachehash(rp, cr)];
3234 * Allocate now, assuming that an allocation will most likely be
3235 * required. This allows the allocation to happen without
3236 * holding the hash bucket locked.
3238 nap = kmem_cache_alloc(acache_cache, KM_NOSLEEP);
3239 if (nap != NULL) {
3240 nap->known = acc;
3241 nap->allowed = resacc;
3242 nap->rnode = rp;
3243 crhold(cr);
3244 nap->cred = cr;
3245 nap->hashq = hp;
3248 rw_enter(&hp->lock, RW_WRITER);
3250 if (rp->r_acache != NULL) {
3251 ap = hp->next;
3252 while (ap != (acache_t *)hp) {
3253 if (crcmp(ap->cred, cr) == 0 && ap->rnode == rp) {
3254 ap->known |= acc;
3255 ap->allowed &= ~acc;
3256 ap->allowed |= resacc;
3257 rw_exit(&hp->lock);
3258 if (nap != NULL) {
3259 crfree(nap->cred);
3260 kmem_cache_free(acache_cache, nap);
3262 return;
3264 ap = ap->next;
3268 if (nap != NULL) {
3269 #ifdef DEBUG
3270 clstat_debug.access.value.ui64++;
3271 #endif
3272 nap->next = hp->next;
3273 hp->next = nap;
3274 nap->next->prev = nap;
3275 nap->prev = (acache_t *)hp;
3277 mutex_enter(&rp->r_statelock);
3278 nap->list = rp->r_acache;
3279 rp->r_acache = nap;
3280 mutex_exit(&rp->r_statelock);
3283 rw_exit(&hp->lock);
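/*
 * Discard every access cache entry associated with this rnode, unlinking
 * each from its hash bucket.  Returns 1 if any entries were freed and 0
 * if the cache was already empty.
 */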
3287 nfs_access_purge_rp(rnode_t *rp)
3289 acache_t *ap;
3290 acache_t *tmpap;
3291 acache_t *rplist;
3294 * If there aren't any cached entries, then there is nothing
3295 * to free.
3297 if (rp->r_acache == NULL)
3298 return (0);
3300 mutex_enter(&rp->r_statelock);
3301 rplist = rp->r_acache;
3302 rp->r_acache = NULL;
3303 mutex_exit(&rp->r_statelock);
3306 * Loop through each entry in the list pointed to in the
3307 * rnode. Remove each of these entries from the hash
3308 * queue that it is on and remove it from the list in
3309 * the rnode.
3311 for (ap = rplist; ap != NULL; ap = tmpap) {
3312 rw_enter(&ap->hashq->lock, RW_WRITER);
3313 ap->prev->next = ap->next;
3314 ap->next->prev = ap->prev;
3315 rw_exit(&ap->hashq->lock);
3317 tmpap = ap->list;
3318 crfree(ap->cred);
3319 kmem_cache_free(acache_cache, ap);
3320 #ifdef DEBUG
3321 clstat_debug.access.value.ui64--;
3322 #endif
3325 return (1);
3328 static const char prefix[] = ".nfs";
3330 static kmutex_t newnum_lock;
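/*
 * Return a counter value used to build unique temporary names; the
 * counter is seeded from the low 16 bits of the current time the first
 * time it is used.
 */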
3333 newnum(void)
3335 static uint_t newnum = 0;
3336 uint_t id;
3338 mutex_enter(&newnum_lock);
3339 if (newnum == 0)
3340 newnum = gethrestime_sec() & 0xffff;
3341 id = newnum++;
3342 mutex_exit(&newnum_lock);
3343 return (id);
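/*
 * Build a temporary file name of the form ".nfsXXXX": the prefix followed
 * by the hexadecimal digits of newnum(), least-significant nibble first
 * (for example, id 0x12ab yields ".nfsBA21").  The caller is responsible
 * for freeing the returned MAXNAMELEN buffer.
 */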
3346 char *
3347 newname(void)
3349 char *news;
3350 char *s;
3351 const char *p;
3352 uint_t id;
3354 id = newnum();
3355 news = kmem_alloc(MAXNAMELEN, KM_SLEEP);
3356 s = news;
3357 p = prefix;
3358 while (*p != '\0')
3359 *s++ = *p++;
3360 while (id != 0) {
3361 *s++ = "0123456789ABCDEF"[id & 0x0f];
3362 id >>= 4;
3364 *s = '\0';
3365 return (news);
3369 * Snapshot callback for nfs:0:nfs_client as registered with the kstat
3370 * framework.
3372 static int
3373 cl_snapshot(kstat_t *ksp, void *buf, int rw)
3375 ksp->ks_snaptime = gethrtime();
3376 if (rw == KSTAT_WRITE) {
3377 bcopy(buf, ksp->ks_private, sizeof (clstat_tmpl));
3378 #ifdef DEBUG
3380 * Currently only the global zone can write to kstats, but we
3381 * add the check just for paranoia.
3383 if (INGLOBALZONE(curproc))
3384 bcopy((char *)buf + sizeof (clstat_tmpl), &clstat_debug,
3385 sizeof (clstat_debug));
3386 #endif
3387 } else {
3388 bcopy(ksp->ks_private, buf, sizeof (clstat_tmpl));
3389 #ifdef DEBUG
3391 * If we're displaying the "global" debug kstat values, we
3392 * display them as-is to all zones since in fact they apply to
3393 * the system as a whole.
3395 bcopy(&clstat_debug, (char *)buf + sizeof (clstat_tmpl),
3396 sizeof (clstat_debug));
3397 #endif
3399 return (0);
3402 static void *
3403 clinit_zone(zoneid_t zoneid)
3405 kstat_t *nfs_client_kstat;
3406 struct nfs_clnt *nfscl;
3407 uint_t ndata;
3409 nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
3410 mutex_init(&nfscl->nfscl_chtable_lock, NULL, MUTEX_DEFAULT, NULL);
3411 nfscl->nfscl_chtable = NULL;
3412 nfscl->nfscl_zoneid = zoneid;
3414 bcopy(&clstat_tmpl, &nfscl->nfscl_stat, sizeof (clstat_tmpl));
3415 ndata = sizeof (clstat_tmpl) / sizeof (kstat_named_t);
3416 #ifdef DEBUG
3417 ndata += sizeof (clstat_debug) / sizeof (kstat_named_t);
3418 #endif
3419 if ((nfs_client_kstat = kstat_create_zone("nfs", 0, "nfs_client",
3420 "misc", KSTAT_TYPE_NAMED, ndata,
3421 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
3422 nfs_client_kstat->ks_private = &nfscl->nfscl_stat;
3423 nfs_client_kstat->ks_snapshot = cl_snapshot;
3424 kstat_install(nfs_client_kstat);
3426 mutex_enter(&nfs_clnt_list_lock);
3427 list_insert_head(&nfs_clnt_list, nfscl);
3428 mutex_exit(&nfs_clnt_list_lock);
3429 return (nfscl);
3432 /*ARGSUSED*/
3433 static void
3434 clfini_zone(zoneid_t zoneid, void *arg)
3436 struct nfs_clnt *nfscl = arg;
3437 chhead_t *chp, *next;
3439 if (nfscl == NULL)
3440 return;
3441 mutex_enter(&nfs_clnt_list_lock);
3442 list_remove(&nfs_clnt_list, nfscl);
3443 mutex_exit(&nfs_clnt_list_lock);
3444 clreclaim_zone(nfscl, 0);
3445 for (chp = nfscl->nfscl_chtable; chp != NULL; chp = next) {
3446 ASSERT(chp->ch_list == NULL);
3447 kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
3448 next = chp->ch_next;
3449 kmem_free(chp, sizeof (*chp));
3451 kstat_delete_byname_zone("nfs", 0, "nfs_client", zoneid);
3452 mutex_destroy(&nfscl->nfscl_chtable_lock);
3453 kmem_free(nfscl, sizeof (*nfscl));
3457 * Called by endpnt_destructor to make sure the client handles are
3458 * cleaned up before the RPC endpoints. This becomes a no-op if
3459 * clfini_zone (above) is called first. This function is needed
3460 * (rather than relying on clfini_zone to clean up) because the ZSD
3461 * callbacks have no ordering mechanism, so we have no way to ensure
3462 * that clfini_zone is called before endpnt_destructor.
3464 void
3465 clcleanup_zone(zoneid_t zoneid)
3467 struct nfs_clnt *nfscl;
3469 mutex_enter(&nfs_clnt_list_lock);
3470 nfscl = list_head(&nfs_clnt_list);
3471 for (; nfscl != NULL; nfscl = list_next(&nfs_clnt_list, nfscl)) {
3472 if (nfscl->nfscl_zoneid == zoneid) {
3473 clreclaim_zone(nfscl, 0);
3474 break;
3477 mutex_exit(&nfs_clnt_list_lock);
3481 nfs_subrinit(void)
3483 int i;
3484 ulong_t nrnode_max;
3487 * Allocate and initialize the rnode hash queues
3489 if (nrnode <= 0)
3490 nrnode = ncsize;
3491 nrnode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode));
3492 if (nrnode > nrnode_max || (nrnode == 0 && ncsize == 0)) {
3493 zcmn_err(GLOBAL_ZONEID, CE_NOTE,
3494 "!setting nrnode to max value of %ld", nrnode_max);
3495 nrnode = nrnode_max;
3498 rtablesize = 1 << highbit(nrnode / hashlen);
3499 rtablemask = rtablesize - 1;
3500 rtable = kmem_alloc(rtablesize * sizeof (*rtable), KM_SLEEP);
3501 for (i = 0; i < rtablesize; i++) {
3502 rtable[i].r_hashf = (rnode_t *)(&rtable[i]);
3503 rtable[i].r_hashb = (rnode_t *)(&rtable[i]);
3504 rw_init(&rtable[i].r_lock, NULL, RW_DEFAULT, NULL);
3506 rnode_cache = kmem_cache_create("rnode_cache", sizeof (rnode_t),
3507 0, NULL, NULL, nfs_reclaim, NULL, NULL, 0);
3510 * Allocate and initialize the access cache
3514 * The initial guess is one access cache entry per rnode, unless
3515 * nacache is set to a non-zero value, in which case it is used as
3516 * the guess at the number of access cache entries.
3518 if (nacache > 0)
3519 acachesize = 1 << highbit(nacache / hashlen);
3520 else
3521 acachesize = rtablesize;
3522 acachemask = acachesize - 1;
3523 acache = kmem_alloc(acachesize * sizeof (*acache), KM_SLEEP);
3524 for (i = 0; i < acachesize; i++) {
3525 acache[i].next = (acache_t *)&acache[i];
3526 acache[i].prev = (acache_t *)&acache[i];
3527 rw_init(&acache[i].lock, NULL, RW_DEFAULT, NULL);
3529 acache_cache = kmem_cache_create("nfs_access_cache",
3530 sizeof (acache_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3532 * Allocate and initialize the client handle cache
3534 chtab_cache = kmem_cache_create("client_handle_cache",
3535 sizeof (struct chtab), 0, NULL, NULL, clreclaim, NULL, NULL, 0);
3537 * Initialize the list of per-zone client handles (and associated data).
3538 * This needs to be done before we call zone_key_create().
3540 list_create(&nfs_clnt_list, sizeof (struct nfs_clnt),
3541 offsetof(struct nfs_clnt, nfscl_node));
3543 * Initialize the zone_key for per-zone client handle lists.
3545 zone_key_create(&nfsclnt_zone_key, clinit_zone, NULL, clfini_zone);
3547 * Initialize the various mutexes and reader/writer locks
3549 mutex_init(&rpfreelist_lock, NULL, MUTEX_DEFAULT, NULL);
3550 mutex_init(&newnum_lock, NULL, MUTEX_DEFAULT, NULL);
3551 mutex_init(&nfs_minor_lock, NULL, MUTEX_DEFAULT, NULL);
3554 * Assign unique major number for all nfs mounts
3556 if ((nfs_major = getudev()) == -1) {
3557 zcmn_err(GLOBAL_ZONEID, CE_WARN,
3558 "nfs: init: can't get unique device number");
3559 nfs_major = 0;
3561 nfs_minor = 0;
3563 if (nfs3_jukebox_delay == 0)
3564 nfs3_jukebox_delay = NFS3_JUKEBOX_DELAY;
3566 return (0);
3569 void
3570 nfs_subrfini(void)
3572 int i;
3575 * Deallocate the rnode hash queues
3577 kmem_cache_destroy(rnode_cache);
3579 for (i = 0; i < rtablesize; i++)
3580 rw_destroy(&rtable[i].r_lock);
3581 kmem_free(rtable, rtablesize * sizeof (*rtable));
3584 * Deallocate the access cache
3586 kmem_cache_destroy(acache_cache);
3588 for (i = 0; i < acachesize; i++)
3589 rw_destroy(&acache[i].lock);
3590 kmem_free(acache, acachesize * sizeof (*acache));
3593 * Deallocate the client handle cache
3595 kmem_cache_destroy(chtab_cache);
3598 * Destroy the various mutexes and reader/writer locks
3600 mutex_destroy(&rpfreelist_lock);
3601 mutex_destroy(&newnum_lock);
3602 mutex_destroy(&nfs_minor_lock);
3603 (void) zone_key_delete(nfsclnt_zone_key);
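/*
 * Translate a local errno into an NFS version 2 status code (geterrno()
 * below does the reverse).  Most version 2 status codes largely share
 * their numeric values with the corresponding errnos, so any value
 * without an explicit case here is passed through unchanged.
 */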
3606 enum nfsstat
3607 puterrno(int error)
3610 switch (error) {
3611 case EOPNOTSUPP:
3612 return (NFSERR_OPNOTSUPP);
3613 case ENAMETOOLONG:
3614 return (NFSERR_NAMETOOLONG);
3615 case ENOTEMPTY:
3616 return (NFSERR_NOTEMPTY);
3617 case EDQUOT:
3618 return (NFSERR_DQUOT);
3619 case ESTALE:
3620 return (NFSERR_STALE);
3621 case EREMOTE:
3622 return (NFSERR_REMOTE);
3623 case ENOSYS:
3624 return (NFSERR_OPNOTSUPP);
3625 case EOVERFLOW:
3626 return (NFSERR_INVAL);
3627 default:
3628 return ((enum nfsstat)error);
3630 /* NOTREACHED */
3634 geterrno(enum nfsstat status)
3637 switch (status) {
3638 case NFSERR_OPNOTSUPP:
3639 return (EOPNOTSUPP);
3640 case NFSERR_NAMETOOLONG:
3641 return (ENAMETOOLONG);
3642 case NFSERR_NOTEMPTY:
3643 return (ENOTEMPTY);
3644 case NFSERR_DQUOT:
3645 return (EDQUOT);
3646 case NFSERR_STALE:
3647 return (ESTALE);
3648 case NFSERR_REMOTE:
3649 return (EREMOTE);
3650 case NFSERR_WFLUSH:
3651 return (EIO);
3652 default:
3653 return ((int)status);
3655 /* NOTREACHED */
3658 enum nfsstat3
3659 puterrno3(int error)
3662 #ifdef DEBUG
3663 switch (error) {
3664 case 0:
3665 return (NFS3_OK);
3666 case EPERM:
3667 return (NFS3ERR_PERM);
3668 case ENOENT:
3669 return (NFS3ERR_NOENT);
3670 case EIO:
3671 return (NFS3ERR_IO);
3672 case ENXIO:
3673 return (NFS3ERR_NXIO);
3674 case EACCES:
3675 return (NFS3ERR_ACCES);
3676 case EEXIST:
3677 return (NFS3ERR_EXIST);
3678 case EXDEV:
3679 return (NFS3ERR_XDEV);
3680 case ENODEV:
3681 return (NFS3ERR_NODEV);
3682 case ENOTDIR:
3683 return (NFS3ERR_NOTDIR);
3684 case EISDIR:
3685 return (NFS3ERR_ISDIR);
3686 case EINVAL:
3687 return (NFS3ERR_INVAL);
3688 case EFBIG:
3689 return (NFS3ERR_FBIG);
3690 case ENOSPC:
3691 return (NFS3ERR_NOSPC);
3692 case EROFS:
3693 return (NFS3ERR_ROFS);
3694 case EMLINK:
3695 return (NFS3ERR_MLINK);
3696 case ENAMETOOLONG:
3697 return (NFS3ERR_NAMETOOLONG);
3698 case ENOTEMPTY:
3699 return (NFS3ERR_NOTEMPTY);
3700 case EDQUOT:
3701 return (NFS3ERR_DQUOT);
3702 case ESTALE:
3703 return (NFS3ERR_STALE);
3704 case EREMOTE:
3705 return (NFS3ERR_REMOTE);
3706 case ENOSYS:
3707 case EOPNOTSUPP:
3708 return (NFS3ERR_NOTSUPP);
3709 case EOVERFLOW:
3710 return (NFS3ERR_INVAL);
3711 default:
3712 zcmn_err(getzoneid(), CE_WARN,
3713 "puterrno3: got error %d", error);
3714 return ((enum nfsstat3)error);
3716 #else
3717 switch (error) {
3718 case ENAMETOOLONG:
3719 return (NFS3ERR_NAMETOOLONG);
3720 case ENOTEMPTY:
3721 return (NFS3ERR_NOTEMPTY);
3722 case EDQUOT:
3723 return (NFS3ERR_DQUOT);
3724 case ESTALE:
3725 return (NFS3ERR_STALE);
3726 case ENOSYS:
3727 case EOPNOTSUPP:
3728 return (NFS3ERR_NOTSUPP);
3729 case EREMOTE:
3730 return (NFS3ERR_REMOTE);
3731 case EOVERFLOW:
3732 return (NFS3ERR_INVAL);
3733 default:
3734 return ((enum nfsstat3)error);
3736 #endif
3740 geterrno3(enum nfsstat3 status)
3743 #ifdef DEBUG
3744 switch (status) {
3745 case NFS3_OK:
3746 return (0);
3747 case NFS3ERR_PERM:
3748 return (EPERM);
3749 case NFS3ERR_NOENT:
3750 return (ENOENT);
3751 case NFS3ERR_IO:
3752 return (EIO);
3753 case NFS3ERR_NXIO:
3754 return (ENXIO);
3755 case NFS3ERR_ACCES:
3756 return (EACCES);
3757 case NFS3ERR_EXIST:
3758 return (EEXIST);
3759 case NFS3ERR_XDEV:
3760 return (EXDEV);
3761 case NFS3ERR_NODEV:
3762 return (ENODEV);
3763 case NFS3ERR_NOTDIR:
3764 return (ENOTDIR);
3765 case NFS3ERR_ISDIR:
3766 return (EISDIR);
3767 case NFS3ERR_INVAL:
3768 return (EINVAL);
3769 case NFS3ERR_FBIG:
3770 return (EFBIG);
3771 case NFS3ERR_NOSPC:
3772 return (ENOSPC);
3773 case NFS3ERR_ROFS:
3774 return (EROFS);
3775 case NFS3ERR_MLINK:
3776 return (EMLINK);
3777 case NFS3ERR_NAMETOOLONG:
3778 return (ENAMETOOLONG);
3779 case NFS3ERR_NOTEMPTY:
3780 return (ENOTEMPTY);
3781 case NFS3ERR_DQUOT:
3782 return (EDQUOT);
3783 case NFS3ERR_STALE:
3784 return (ESTALE);
3785 case NFS3ERR_REMOTE:
3786 return (EREMOTE);
3787 case NFS3ERR_BADHANDLE:
3788 return (ESTALE);
3789 case NFS3ERR_NOT_SYNC:
3790 return (EINVAL);
3791 case NFS3ERR_BAD_COOKIE:
3792 return (ENOENT);
3793 case NFS3ERR_NOTSUPP:
3794 return (EOPNOTSUPP);
3795 case NFS3ERR_TOOSMALL:
3796 return (EINVAL);
3797 case NFS3ERR_SERVERFAULT:
3798 return (EIO);
3799 case NFS3ERR_BADTYPE:
3800 return (EINVAL);
3801 case NFS3ERR_JUKEBOX:
3802 return (ENXIO);
3803 default:
3804 zcmn_err(getzoneid(), CE_WARN,
3805 "geterrno3: got status %d", status);
3806 return ((int)status);
3808 #else
3809 switch (status) {
3810 case NFS3ERR_NAMETOOLONG:
3811 return (ENAMETOOLONG);
3812 case NFS3ERR_NOTEMPTY:
3813 return (ENOTEMPTY);
3814 case NFS3ERR_DQUOT:
3815 return (EDQUOT);
3816 case NFS3ERR_STALE:
3817 case NFS3ERR_BADHANDLE:
3818 return (ESTALE);
3819 case NFS3ERR_NOTSUPP:
3820 return (EOPNOTSUPP);
3821 case NFS3ERR_REMOTE:
3822 return (EREMOTE);
3823 case NFS3ERR_NOT_SYNC:
3824 case NFS3ERR_TOOSMALL:
3825 case NFS3ERR_BADTYPE:
3826 return (EINVAL);
3827 case NFS3ERR_BAD_COOKIE:
3828 return (ENOENT);
3829 case NFS3ERR_SERVERFAULT:
3830 return (EIO);
3831 case NFS3ERR_JUKEBOX:
3832 return (ENXIO);
3833 default:
3834 return ((int)status);
3836 #endif
3839 rddir_cache *
3840 rddir_cache_alloc(int flags)
3842 rddir_cache *rc;
3844 rc = kmem_alloc(sizeof (*rc), flags);
3845 if (rc != NULL) {
3846 rc->entries = NULL;
3847 rc->flags = RDDIR;
3848 cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
3849 mutex_init(&rc->lock, NULL, MUTEX_DEFAULT, NULL);
3850 rc->count = 1;
3851 #ifdef DEBUG
3852 atomic_inc_64(&clstat_debug.dirent.value.ui64);
3853 #endif
3855 return (rc);
3858 static void
3859 rddir_cache_free(rddir_cache *rc)
3862 #ifdef DEBUG
3863 atomic_dec_64(&clstat_debug.dirent.value.ui64);
3864 #endif
3865 if (rc->entries != NULL) {
3866 #ifdef DEBUG
3867 rddir_cache_buf_free(rc->entries, rc->buflen);
3868 #else
3869 kmem_free(rc->entries, rc->buflen);
3870 #endif
3872 cv_destroy(&rc->cv);
3873 mutex_destroy(&rc->lock);
3874 kmem_free(rc, sizeof (*rc));
3877 void
3878 rddir_cache_hold(rddir_cache *rc)
3881 mutex_enter(&rc->lock);
3882 rc->count++;
3883 mutex_exit(&rc->lock);
3886 void
3887 rddir_cache_rele(rddir_cache *rc)
3890 mutex_enter(&rc->lock);
3891 ASSERT(rc->count > 0);
3892 if (--rc->count == 0) {
3893 mutex_exit(&rc->lock);
3894 rddir_cache_free(rc);
3895 } else
3896 mutex_exit(&rc->lock);
3899 #ifdef DEBUG
3900 char *
3901 rddir_cache_buf_alloc(size_t size, int flags)
3903 char *rc;
3905 rc = kmem_alloc(size, flags);
3906 if (rc != NULL)
3907 atomic_add_64(&clstat_debug.dirents.value.ui64, size);
3908 return (rc);
3911 void
3912 rddir_cache_buf_free(void *addr, size_t size)
3915 atomic_add_64(&clstat_debug.dirents.value.ui64, -(int64_t)size);
3916 kmem_free(addr, size);
3918 #endif
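/*
 * nfs_free_data_reclaim() and nfs_active_data_reclaim() release the
 * cached data held by an rnode: symlink contents, ACL, pathconf, access
 * and readdir caches, and (in the freelist variant) the held credential.
 * The active variant uses mutex_tryenter() so the reclaim path never
 * blocks on a busy rnode.  Both return non-zero when they manage to
 * release cached data.
 */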
3920 static int
3921 nfs_free_data_reclaim(rnode_t *rp)
3923 char *contents;
3924 int size;
3925 vsecattr_t *vsp;
3926 nfs3_pathconf_info *info;
3927 int freed;
3928 cred_t *cred;
3931 * Free any held credentials and caches which
3932 * may be associated with this rnode.
3934 mutex_enter(&rp->r_statelock);
3935 cred = rp->r_cred;
3936 rp->r_cred = NULL;
3937 contents = rp->r_symlink.contents;
3938 size = rp->r_symlink.size;
3939 rp->r_symlink.contents = NULL;
3940 vsp = rp->r_secattr;
3941 rp->r_secattr = NULL;
3942 info = rp->r_pathconf;
3943 rp->r_pathconf = NULL;
3944 mutex_exit(&rp->r_statelock);
3946 if (cred != NULL)
3947 crfree(cred);
3950 * Free the access cache entries.
3952 freed = nfs_access_purge_rp(rp);
3954 if (!HAVE_RDDIR_CACHE(rp) &&
3955 contents == NULL &&
3956 vsp == NULL &&
3957 info == NULL)
3958 return (freed);
3961 * Free the readdir cache entries
3963 if (HAVE_RDDIR_CACHE(rp))
3964 nfs_purge_rddir_cache(RTOV(rp));
3967 * Free the symbolic link cache.
3969 if (contents != NULL) {
3971 kmem_free((void *)contents, size);
3975 * Free any cached ACL.
3977 if (vsp != NULL)
3978 nfs_acl_free(vsp);
3981 * Free any cached pathconf information.
3983 if (info != NULL)
3984 kmem_free(info, sizeof (*info));
3986 return (1);
3989 static int
3990 nfs_active_data_reclaim(rnode_t *rp)
3992 char *contents;
3993 int size;
3994 vsecattr_t *vsp;
3995 nfs3_pathconf_info *info;
3996 int freed;
3999 * Free any held credentials and caches which
4000 * may be associated with this rnode.
4002 if (!mutex_tryenter(&rp->r_statelock))
4003 return (0);
4004 contents = rp->r_symlink.contents;
4005 size = rp->r_symlink.size;
4006 rp->r_symlink.contents = NULL;
4007 vsp = rp->r_secattr;
4008 rp->r_secattr = NULL;
4009 info = rp->r_pathconf;
4010 rp->r_pathconf = NULL;
4011 mutex_exit(&rp->r_statelock);
4014 * Free the access cache entries.
4016 freed = nfs_access_purge_rp(rp);
4018 if (!HAVE_RDDIR_CACHE(rp) &&
4019 contents == NULL &&
4020 vsp == NULL &&
4021 info == NULL)
4022 return (freed);
4025 * Free the readdir cache entries
4027 if (HAVE_RDDIR_CACHE(rp))
4028 nfs_purge_rddir_cache(RTOV(rp));
4031 * Free the symbolic link cache.
4033 if (contents != NULL) {
4035 kmem_free((void *)contents, size);
4039 * Free any cached ACL.
4041 if (vsp != NULL)
4042 nfs_acl_free(vsp);
4045 * Free any cached pathconf information.
4047 if (info != NULL)
4048 kmem_free(info, sizeof (*info));
4050 return (1);
4053 static int
4054 nfs_free_reclaim(void)
4056 int freed;
4057 rnode_t *rp;
4059 #ifdef DEBUG
4060 clstat_debug.f_reclaim.value.ui64++;
4061 #endif
4062 freed = 0;
4063 mutex_enter(&rpfreelist_lock);
4064 rp = rpfreelist;
4065 if (rp != NULL) {
4066 do {
4067 if (nfs_free_data_reclaim(rp))
4068 freed = 1;
4069 } while ((rp = rp->r_freef) != rpfreelist);
4071 mutex_exit(&rpfreelist_lock);
4072 return (freed);
4075 static int
4076 nfs_active_reclaim(void)
4078 int freed;
4079 int index;
4080 rnode_t *rp;
4082 #ifdef DEBUG
4083 clstat_debug.a_reclaim.value.ui64++;
4084 #endif
4085 freed = 0;
4086 for (index = 0; index < rtablesize; index++) {
4087 rw_enter(&rtable[index].r_lock, RW_READER);
4088 for (rp = rtable[index].r_hashf;
4089 rp != (rnode_t *)(&rtable[index]);
4090 rp = rp->r_hashf) {
4091 if (nfs_active_data_reclaim(rp))
4092 freed = 1;
4094 rw_exit(&rtable[index].r_lock);
4096 return (freed);
4099 static int
4100 nfs_rnode_reclaim(void)
4102 int freed;
4103 rnode_t *rp;
4104 vnode_t *vp;
4106 #ifdef DEBUG
4107 clstat_debug.r_reclaim.value.ui64++;
4108 #endif
4109 freed = 0;
4110 mutex_enter(&rpfreelist_lock);
4111 while ((rp = rpfreelist) != NULL) {
4112 rp_rmfree(rp);
4113 mutex_exit(&rpfreelist_lock);
4114 if (rp->r_flags & RHASHED) {
4115 vp = RTOV(rp);
4116 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4117 mutex_enter(&vp->v_lock);
4118 if (vp->v_count > 1) {
4119 VN_RELE_LOCKED(vp);
4120 mutex_exit(&vp->v_lock);
4121 rw_exit(&rp->r_hashq->r_lock);
4122 mutex_enter(&rpfreelist_lock);
4123 continue;
4125 mutex_exit(&vp->v_lock);
4126 rp_rmhash_locked(rp);
4127 rw_exit(&rp->r_hashq->r_lock);
4130 * This call to rp_addfree will end up destroying the
4131 * rnode, but in a safe way with the appropriate set
4132 * of checks done.
4134 rp_addfree(rp, CRED());
4135 mutex_enter(&rpfreelist_lock);
4137 mutex_exit(&rpfreelist_lock);
4138 return (freed);
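/*
 * Kmem reclaim callback for rnode_cache.  It escalates through three
 * passes, stopping as soon as one of them frees something: strip cached
 * data from rnodes on the freelist (nfs_free_reclaim), then from active
 * rnodes (nfs_active_reclaim), and finally tear down whole rnodes from
 * the freelist (nfs_rnode_reclaim).
 */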
4141 /*ARGSUSED*/
4142 static void
4143 nfs_reclaim(void *cdrarg)
4146 #ifdef DEBUG
4147 clstat_debug.reclaim.value.ui64++;
4148 #endif
4149 if (nfs_free_reclaim())
4150 return;
4152 if (nfs_active_reclaim())
4153 return;
4155 (void) nfs_rnode_reclaim();
4159 * NFS client failover support
4161 * Routines to copy filehandles
4163 void
4164 nfscopyfh(caddr_t fhp, vnode_t *vp)
4166 fhandle_t *dest = (fhandle_t *)fhp;
4168 if (dest != NULL)
4169 *dest = *VTOFH(vp);
4172 void
4173 nfs3copyfh(caddr_t fhp, vnode_t *vp)
4175 nfs_fh3 *dest = (nfs_fh3 *)fhp;
4177 if (dest != NULL)
4178 *dest = *VTOFH3(vp);
4182 * NFS client failover support
4184 * failover_safe() will test various conditions to ensure that
4185 * failover is permitted for this vnode. It will be denied
4186 * if:
4187 * 1) the operation in progress does not support failover (NULL fi)
4188 * 2) there are no available replicas (NULL mi_servers->sv_next)
4189 * 3) any locks are outstanding on this file
4191 static int
4192 failover_safe(failinfo_t *fi)
4196 * Does this op permit failover?
4198 if (fi == NULL || fi->vp == NULL)
4199 return (0);
4202 * Are there any alternates to failover to?
4204 if (VTOMI(fi->vp)->mi_servers->sv_next == NULL)
4205 return (0);
4208 * Disable check; we've forced local locking
4210 * if (flk_has_remote_locks(fi->vp))
4211 * return (0);
4215 * If we have no partial path, we can't do anything
4217 if (VTOR(fi->vp)->r_path == NULL)
4218 return (0);
4220 return (1);
4223 #include <sys/thread.h>
4226 * NFS client failover support
4228 * failover_newserver() will start a search for a new server,
4229 * preferably by starting an async thread to do the work. If
4230 * someone is already doing this (recognizable by MI_BINDINPROG
4231 * being set), it will simply return and the calling thread
4232 * will queue on the mi_failover_cv condition variable.
4234 static void
4235 failover_newserver(mntinfo_t *mi)
4238 * Check if someone else is doing this already
4240 mutex_enter(&mi->mi_lock);
4241 if (mi->mi_flags & MI_BINDINPROG) {
4242 mutex_exit(&mi->mi_lock);
4243 return;
4245 mi->mi_flags |= MI_BINDINPROG;
4248 * Need to hold the vfs struct so that it can't be released
4249 * while the failover thread is selecting a new server.
4251 VFS_HOLD(mi->mi_vfsp);
4254 * Start a thread to do the real searching.
4256 (void) zthread_create(NULL, 0, failover_thread, mi, 0, minclsyspri);
4258 mutex_exit(&mi->mi_lock);
4262 * NFS client failover support
4264 * failover_thread() will find a new server to replace the one
4265 * currently in use, wake up other threads waiting on this mount
4266 * point, and die. It will start at the head of the server list
4267 * and poll servers until it finds one with an NFS server which is
4268 * registered and responds to a NULL procedure ping.
4270 * XXX failover_thread is unsafe within the scope of the
4271 * present model defined for cpr to suspend the system.
4272 * Specifically, over-the-wire calls made by the thread
4273 * are unsafe. The thread needs to be reevaluated in case of
4274 * future updates to the cpr suspend model.
4276 static void
4277 failover_thread(mntinfo_t *mi)
4279 servinfo_t *svp = NULL;
4280 CLIENT *cl;
4281 enum clnt_stat status;
4282 struct timeval tv;
4283 int error;
4284 int oncethru = 0;
4285 callb_cpr_t cprinfo;
4286 rnode_t *rp;
4287 int index;
4288 char *srvnames;
4289 size_t srvnames_len;
4290 struct nfs_clnt *nfscl = NULL;
4291 zoneid_t zoneid = getzoneid();
4293 #ifdef DEBUG
4295 * This is currently only needed to access counters which exist on
4296 * DEBUG kernels, hence we don't want to pay the penalty of the lookup
4297 * on non-DEBUG kernels.
4299 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4300 ASSERT(nfscl != NULL);
4301 #endif
4304 * It's safe to piggyback on the mi_lock since failover_newserver()
4305 * code guarantees that there will be only one failover thread
4306 * per mountinfo at any instance.
4308 CALLB_CPR_INIT(&cprinfo, &mi->mi_lock, callb_generic_cpr,
4309 "failover_thread");
4311 mutex_enter(&mi->mi_lock);
4312 while (mi->mi_readers) {
4313 CALLB_CPR_SAFE_BEGIN(&cprinfo);
4314 cv_wait(&mi->mi_failover_cv, &mi->mi_lock);
4315 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4317 mutex_exit(&mi->mi_lock);
4319 tv.tv_sec = 2;
4320 tv.tv_usec = 0;
4323 * Ping the null NFS procedure of every server in
4324 * the list until one responds. We always start
4325 * at the head of the list and always skip the one
4326 * that is current, since it's caused us a problem.
4328 while (svp == NULL) {
4329 for (svp = mi->mi_servers; svp; svp = svp->sv_next) {
4330 if (!oncethru && svp == mi->mi_curr_serv)
4331 continue;
4334 * If the file system was forcibly umounted
4335 * while trying to do a failover, then just
4336 * give up on the failover. It won't matter
4337 * what the server is.
4339 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) {
4340 svp = NULL;
4341 goto done;
4344 error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
4345 NFS_PROGRAM, NFS_VERSION, 0, 1, CRED(), &cl);
4346 if (error)
4347 continue;
4349 if (!(mi->mi_flags & MI_INT))
4350 cl->cl_nosignal = TRUE;
4351 status = CLNT_CALL(cl, RFS_NULL, xdr_void, NULL,
4352 xdr_void, NULL, tv);
4353 if (!(mi->mi_flags & MI_INT))
4354 cl->cl_nosignal = FALSE;
4355 AUTH_DESTROY(cl->cl_auth);
4356 CLNT_DESTROY(cl);
4357 if (status == RPC_SUCCESS) {
4358 if (svp == mi->mi_curr_serv) {
4359 #ifdef DEBUG
4360 zcmn_err(zoneid, CE_NOTE,
4361 "NFS%d: failing over: selecting original server %s",
4362 mi->mi_vers, svp->sv_hostname);
4363 #else
4364 zcmn_err(zoneid, CE_NOTE,
4365 "NFS: failing over: selecting original server %s",
4366 svp->sv_hostname);
4367 #endif
4368 } else {
4369 #ifdef DEBUG
4370 zcmn_err(zoneid, CE_NOTE,
4371 "NFS%d: failing over from %s to %s",
4372 mi->mi_vers,
4373 mi->mi_curr_serv->sv_hostname,
4374 svp->sv_hostname);
4375 #else
4376 zcmn_err(zoneid, CE_NOTE,
4377 "NFS: failing over from %s to %s",
4378 mi->mi_curr_serv->sv_hostname,
4379 svp->sv_hostname);
4380 #endif
4382 break;
4386 if (svp == NULL) {
4387 if (!oncethru) {
4388 srvnames = nfs_getsrvnames(mi, &srvnames_len);
4389 #ifdef DEBUG
4390 zprintf(zoneid,
4391 "NFS%d servers %s not responding "
4392 "still trying\n", mi->mi_vers, srvnames);
4393 #else
4394 zprintf(zoneid, "NFS servers %s not responding "
4395 "still trying\n", srvnames);
4396 #endif
4397 oncethru = 1;
4399 mutex_enter(&mi->mi_lock);
4400 CALLB_CPR_SAFE_BEGIN(&cprinfo);
4401 mutex_exit(&mi->mi_lock);
4402 ddi_sleep(1);
4403 mutex_enter(&mi->mi_lock);
4404 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_lock);
4405 mutex_exit(&mi->mi_lock);
4409 if (oncethru) {
4410 #ifdef DEBUG
4411 zprintf(zoneid, "NFS%d servers %s ok\n", mi->mi_vers, srvnames);
4412 #else
4413 zprintf(zoneid, "NFS servers %s ok\n", srvnames);
4414 #endif
4417 if (svp != mi->mi_curr_serv) {
4418 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
4419 index = rtablehash(&mi->mi_curr_serv->sv_fhandle);
4420 rw_enter(&rtable[index].r_lock, RW_WRITER);
4421 rp = rfind(&rtable[index], &mi->mi_curr_serv->sv_fhandle,
4422 mi->mi_vfsp);
4423 if (rp != NULL) {
4424 if (rp->r_flags & RHASHED)
4425 rp_rmhash_locked(rp);
4426 rw_exit(&rtable[index].r_lock);
4427 rp->r_server = svp;
4428 rp->r_fh = svp->sv_fhandle;
4429 (void) nfs_free_data_reclaim(rp);
4430 index = rtablehash(&rp->r_fh);
4431 rp->r_hashq = &rtable[index];
4432 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4433 vn_exists(RTOV(rp));
4434 rp_addhash(rp);
4435 rw_exit(&rp->r_hashq->r_lock);
4436 VN_RELE(RTOV(rp));
4437 } else
4438 rw_exit(&rtable[index].r_lock);
4441 done:
4442 if (oncethru)
4443 kmem_free(srvnames, srvnames_len);
4444 mutex_enter(&mi->mi_lock);
4445 mi->mi_flags &= ~MI_BINDINPROG;
4446 if (svp != NULL) {
4447 mi->mi_curr_serv = svp;
4448 mi->mi_failover++;
4449 #ifdef DEBUG
4450 nfscl->nfscl_stat.failover.value.ui64++;
4451 #endif
4453 cv_broadcast(&mi->mi_failover_cv);
4454 CALLB_CPR_EXIT(&cprinfo);
4455 VFS_RELE(mi->mi_vfsp);
4456 zthread_exit();
4457 /* NOTREACHED */
4461 * NFS client failover support
4463 * failover_wait() will put the thread to sleep until MI_BINDINPROG
4464 * is cleared, meaning that failover is complete. Called with
4465 * mi_lock mutex held.
4467 static int
4468 failover_wait(mntinfo_t *mi)
4470 k_sigset_t smask;
4473 * If someone else is hunting for a living server,
4474 * sleep until it's done. After our sleep, we may
4475 * be bound to the right server and get off cheaply.
4477 while (mi->mi_flags & MI_BINDINPROG) {
4479 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
4480 * and SIGTERM. (Preserving the existing masks).
4481 * Mask out SIGINT if mount option nointr is specified.
4483 sigintr(&smask, (int)mi->mi_flags & MI_INT);
4484 if (!cv_wait_sig(&mi->mi_failover_cv, &mi->mi_lock)) {
4486 * restore original signal mask
4488 sigunintr(&smask);
4489 return (EINTR);
4492 * restore original signal mask
4494 sigunintr(&smask);
4496 return (0);
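/*
 * Editor's illustrative sketch (not part of the original source): a
 * hypothetical caller of failover_wait().  As the comment above notes,
 * mi_lock must be held across the call; 0 is returned once
 * MI_BINDINPROG is clear, or EINTR if the wait was interrupted.
 */
static int
example_wait_for_binding(mntinfo_t *mi)
{
	int error;

	mutex_enter(&mi->mi_lock);
	error = failover_wait(mi);	/* 0, or EINTR on a signal */
	mutex_exit(&mi->mi_lock);

	return (error);
}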
4500 * NFS client failover support
4502 * failover_remap() will do a partial pathname lookup and find the
4503 * desired vnode on the current server. The interim vnode will be
4504 * discarded after we pilfer the new filehandle.
4506 * Side effects:
4507 * - This routine will also update the filehandle in the args structure
4508 * pointed to by the fi->fhp pointer if it is non-NULL.
4511 static int
4512 failover_remap(failinfo_t *fi)
4514 vnode_t *vp, *nvp, *rootvp;
4515 rnode_t *rp, *nrp;
4516 mntinfo_t *mi;
4517 int error;
4518 #ifdef DEBUG
4519 struct nfs_clnt *nfscl;
4521 nfscl = zone_getspecific(nfsclnt_zone_key, nfs_zone());
4522 ASSERT(nfscl != NULL);
4523 #endif
4525 * Sanity check
4527 if (fi == NULL || fi->vp == NULL || fi->lookupproc == NULL)
4528 return (EINVAL);
4529 vp = fi->vp;
4530 rp = VTOR(vp);
4531 mi = VTOMI(vp);
4533 if (!(vp->v_flag & VROOT)) {
4535 * Given the root fh, use the path stored in
4536 * the rnode to find the fh for the new server.
4538 error = VFS_ROOT(mi->mi_vfsp, &rootvp);
4539 if (error)
4540 return (error);
4542 error = failover_lookup(rp->r_path, rootvp,
4543 fi->lookupproc, fi->xattrdirproc, &nvp);
4545 VN_RELE(rootvp);
4547 if (error)
4548 return (error);
4551 * If we found the same rnode, we're done now
4553 if (nvp == vp) {
4555 * The failed server and the new server may physically be
4556 * the same, or may share the same disk subsystem. In that
4557 * case the filehandle for a particular file path is not
4558 * going to change, so a lookup with the same filehandle
4559 * will always locate the same rnode as the existing one.
4560 * All we might need to do is to update the r_server
4561 * with the current servinfo.
4563 if (!VALID_FH(fi)) {
4564 rp->r_server = mi->mi_curr_serv;
4566 VN_RELE(nvp);
4567 return (0);
4571 * Try to make it so that no one else will find this
4572 * vnode because it is just a temporary to hold the
4573 * new file handle until that file handle can be
4574 * copied to the original vnode/rnode.
4576 nrp = VTOR(nvp);
4577 mutex_enter(&mi->mi_remap_lock);
4579 * Some other thread could have raced in here and already
4580 * done the remap for this particular rnode before this
4581 * thread got here. Compare rp->r_server with
4582 * mi->mi_curr_serv and return if they are the same.
4584 if (VALID_FH(fi)) {
4585 mutex_exit(&mi->mi_remap_lock);
4586 VN_RELE(nvp);
4587 return (0);
4590 if (nrp->r_flags & RHASHED)
4591 rp_rmhash(nrp);
4594 * As a heuristic check on the validity of the new
4595 * file, check that the size and type match what we
4596 * remember from the old version.
4598 if (rp->r_size != nrp->r_size || vp->v_type != nvp->v_type) {
4599 mutex_exit(&mi->mi_remap_lock);
4600 zcmn_err(mi->mi_zone->zone_id, CE_WARN,
4601 "NFS replicas %s and %s: file %s not same.",
4602 rp->r_server->sv_hostname,
4603 nrp->r_server->sv_hostname, rp->r_path);
4604 VN_RELE(nvp);
4605 return (EINVAL);
4609 * snarf the filehandle from the new rnode
4610 * then release it, again while updating the
4611 * hash queues for the rnode.
4613 if (rp->r_flags & RHASHED)
4614 rp_rmhash(rp);
4615 rp->r_server = mi->mi_curr_serv;
4616 rp->r_fh = nrp->r_fh;
4617 rp->r_hashq = nrp->r_hashq;
4619 * Copy the attributes from the new rnode to the old
4620 * rnode. This will help to reduce unnecessary page
4621 * cache flushes.
4623 rp->r_attr = nrp->r_attr;
4624 rp->r_attrtime = nrp->r_attrtime;
4625 rp->r_mtime = nrp->r_mtime;
4626 (void) nfs_free_data_reclaim(rp);
4627 nfs_setswaplike(vp, &rp->r_attr);
4628 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
4629 rp_addhash(rp);
4630 rw_exit(&rp->r_hashq->r_lock);
4631 mutex_exit(&mi->mi_remap_lock);
4632 VN_RELE(nvp);
4636 * Update successful failover remap count
4638 mutex_enter(&mi->mi_lock);
4639 mi->mi_remap++;
4640 mutex_exit(&mi->mi_lock);
4641 #ifdef DEBUG
4642 nfscl->nfscl_stat.remap.value.ui64++;
4643 #endif
4646 * If we have a copied filehandle to update, do it now.
4648 if (fi->fhp != NULL && fi->copyproc != NULL)
4649 (*fi->copyproc)(fi->fhp, vp);
4651 return (0);
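/*
 * Editor's illustrative sketch (not part of the original source): how
 * a retry path might use failover_remap() after the mount has been
 * rebound to a new server.  The failinfo_t is assumed to already carry
 * the vnode and the lookup/copy callbacks referenced above; on success
 * the rnode holds the new server's filehandle and fi->fhp (if set) has
 * been refreshed via fi->copyproc, so the caller can simply reissue
 * the original request.
 */
static int
example_remap_and_retry(failinfo_t *fi)
{
	int error;

	error = failover_remap(fi);
	if (error)
		return (error);	/* e.g. EINVAL if the replicas differ */

	/*
	 * fi->vp (and fi->fhp, when copyproc is set) now reference the
	 * current server; the original RPC can be retried here.
	 */
	return (0);
}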
4655 * NFS client failover support
4657 * We want a simple pathname lookup routine to parse the pieces
4658 * of path in rp->r_path. We know that the path was created
4659 * as rnodes were made, so we know we have only to deal with
4660 * paths that look like:
4661 * dir1/dir2/dir3/file
4662 * Any evidence of anything like .., symlinks, or ENOTDIR
4663 * is a hard error, because it means something in this filesystem
4664 * is different from the one we came from, or has changed under
4665 * us in some way. If this is true, we want the failure.
4667 * Extended attributes: if the filesystem is mounted with extended
4668 * attributes enabled (-o xattr), the attribute directory will be
4669 * represented in the r_path as the magic name XATTR_RPATH. So if
4670 * we see that name in the pathname, it must be because this node
4671 * is an extended attribute. Therefore, look it up that way.
4673 static int
4674 failover_lookup(char *path, vnode_t *root,
4675 int (*lookupproc)(vnode_t *, char *, vnode_t **, struct pathname *, int,
4676 vnode_t *, cred_t *, int),
4677 int (*xattrdirproc)(vnode_t *, vnode_t **, bool_t, cred_t *, int),
4678 vnode_t **new)
4680 vnode_t *dvp, *nvp;
4681 int error = EINVAL;
4682 char *s, *p, *tmppath;
4683 size_t len;
4684 mntinfo_t *mi;
4685 bool_t xattr;
4687 /* Make local copy of path */
4688 len = strlen(path) + 1;
4689 tmppath = kmem_alloc(len, KM_SLEEP);
4690 (void) strcpy(tmppath, path);
4691 s = tmppath;
4693 dvp = root;
4694 VN_HOLD(dvp);
4695 mi = VTOMI(root);
4696 xattr = mi->mi_flags & MI_EXTATTR;
4698 do {
4699 p = strchr(s, '/');
4700 if (p != NULL)
4701 *p = '\0';
4702 if (xattr && strcmp(s, XATTR_RPATH) == 0) {
4703 error = (*xattrdirproc)(dvp, &nvp, FALSE, CRED(),
4704 RFSCALL_SOFT);
4705 } else {
4706 error = (*lookupproc)(dvp, s, &nvp, NULL, 0, NULL,
4707 CRED(), RFSCALL_SOFT);
4709 if (p != NULL)
4710 *p++ = '/';
4711 if (error) {
4712 VN_RELE(dvp);
4713 kmem_free(tmppath, len);
4714 return (error);
4716 s = p;
4717 VN_RELE(dvp);
4718 dvp = nvp;
4719 } while (p != NULL);
4721 if (nvp != NULL && new != NULL)
4722 *new = nvp;
4723 kmem_free(tmppath, len);
4724 return (0);
4728 * NFS client failover support
4730 * sv_free() frees the malloc'd portion of a "servinfo_t".
4732 void
4733 sv_free(servinfo_t *svp)
4735 servinfo_t *next;
4736 struct knetconfig *knconf;
4738 while (svp != NULL) {
4739 next = svp->sv_next;
4740 if (svp->sv_secdata)
4741 sec_clnt_freeinfo(svp->sv_secdata);
4742 if (svp->sv_hostname && svp->sv_hostnamelen > 0)
4743 kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
4744 knconf = svp->sv_knconf;
4745 if (knconf != NULL) {
4746 if (knconf->knc_protofmly != NULL)
4747 kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4748 if (knconf->knc_proto != NULL)
4749 kmem_free(knconf->knc_proto, KNC_STRSIZE);
4750 kmem_free(knconf, sizeof (*knconf));
4752 knconf = svp->sv_origknconf;
4753 if (knconf != NULL) {
4754 if (knconf->knc_protofmly != NULL)
4755 kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
4756 if (knconf->knc_proto != NULL)
4757 kmem_free(knconf->knc_proto, KNC_STRSIZE);
4758 kmem_free(knconf, sizeof (*knconf));
4760 if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
4761 kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
4762 mutex_destroy(&svp->sv_lock);
4763 kmem_free(svp, sizeof (*svp));
4764 svp = next;
4769 * Can only return non-zero if intr != 0.
4772 nfs_rw_enter_sig(nfs_rwlock_t *l, krw_t rw, int intr)
4775 mutex_enter(&l->lock);
4778 * If this is a nested enter, then allow it. There
4779 * must be as many exits as enters, though.
4781 if (l->owner == curthread) {
4782 /* lock is held for writing by current thread */
4783 ASSERT(rw == RW_READER || rw == RW_WRITER);
4784 l->count--;
4785 } else if (rw == RW_READER) {
4787 * While there is a writer active or writers waiting,
4788 * then wait for them to finish up and move on. Then,
4789 * increment the count to indicate that a reader is
4790 * active.
4792 while (l->count < 0 || l->waiters > 0) {
4793 if (intr) {
4794 klwp_t *lwp = ttolwp(curthread);
4796 if (lwp != NULL)
4797 lwp->lwp_nostop++;
4798 if (cv_wait_sig(&l->cv_rd, &l->lock) == 0) {
4799 if (lwp != NULL)
4800 lwp->lwp_nostop--;
4801 mutex_exit(&l->lock);
4802 return (EINTR);
4804 if (lwp != NULL)
4805 lwp->lwp_nostop--;
4806 } else
4807 cv_wait(&l->cv_rd, &l->lock);
4809 ASSERT(l->count < INT_MAX);
4810 #ifdef DEBUG
4811 if ((l->count % 10000) == 9999)
4812 cmn_err(CE_WARN, "nfs_rw_enter_sig: count %d on "
4813 "rwlock @ %p\n", l->count, (void *)l);
4814 #endif
4815 l->count++;
4816 } else {
4817 ASSERT(rw == RW_WRITER);
4819 * While there are readers active or a writer
4820 * active, then wait for all of the readers
4821 * to finish or for the writer to finish.
4822 * Then, set the owner field to curthread and
4823 * decrement count to indicate that a writer
4824 * is active.
4826 while (l->count != 0) {
4827 l->waiters++;
4828 if (intr) {
4829 klwp_t *lwp = ttolwp(curthread);
4831 if (lwp != NULL)
4832 lwp->lwp_nostop++;
4833 if (cv_wait_sig(&l->cv, &l->lock) == 0) {
4834 if (lwp != NULL)
4835 lwp->lwp_nostop--;
4836 l->waiters--;
4838 * If there are readers active and no
4839 * writers waiting then wake up all of
4840 * the waiting readers (if any).
4842 if (l->count > 0 && l->waiters == 0)
4843 cv_broadcast(&l->cv_rd);
4844 mutex_exit(&l->lock);
4845 return (EINTR);
4847 if (lwp != NULL)
4848 lwp->lwp_nostop--;
4849 } else
4850 cv_wait(&l->cv, &l->lock);
4851 l->waiters--;
4853 ASSERT(l->owner == NULL);
4854 l->owner = curthread;
4855 l->count--;
4858 mutex_exit(&l->lock);
4860 return (0);
4864 * If the lock is available, obtain it and return non-zero. If there is
4865 * already a conflicting lock, return 0 immediately.
4869 nfs_rw_tryenter(nfs_rwlock_t *l, krw_t rw)
4871 mutex_enter(&l->lock);
4874 * If this is a nested enter, then allow it. There
4875 * must be as many exits as enters, though.
4877 if (l->owner == curthread) {
4878 /* lock is held for writing by current thread */
4879 ASSERT(rw == RW_READER || rw == RW_WRITER);
4880 l->count--;
4881 } else if (rw == RW_READER) {
4883 * If there is a writer active or writers waiting, deny the
4884 * lock. Otherwise, bump the count of readers.
4886 if (l->count < 0 || l->waiters > 0) {
4887 mutex_exit(&l->lock);
4888 return (0);
4890 l->count++;
4891 } else {
4892 ASSERT(rw == RW_WRITER);
4894 * If there are readers active or a writer active, deny the
4895 * lock. Otherwise, set the owner field to curthread and
4896 * decrement count to indicate that a writer is active.
4898 if (l->count != 0) {
4899 mutex_exit(&l->lock);
4900 return (0);
4902 ASSERT(l->owner == NULL);
4903 l->owner = curthread;
4904 l->count--;
4907 mutex_exit(&l->lock);
4909 return (1);
4912 void
4913 nfs_rw_exit(nfs_rwlock_t *l)
4916 mutex_enter(&l->lock);
4918 if (l->owner != NULL) {
4919 ASSERT(l->owner == curthread);
4922 * To release a writer lock increment count to indicate that
4923 * there is one less writer active. If this was the last of
4924 * possibly nested writer locks, then clear the owner field as
4925 * well to indicate that there is no writer active.
4927 ASSERT(l->count < 0);
4928 l->count++;
4929 if (l->count == 0) {
4930 l->owner = NULL;
4933 * If there are no writers waiting then wakeup all of
4934 * the waiting readers (if any).
4936 if (l->waiters == 0)
4937 cv_broadcast(&l->cv_rd);
4939 } else {
4941 * To release a reader lock just decrement count to indicate
4942 * that there is one less reader active.
4944 ASSERT(l->count > 0);
4945 l->count--;
4949 * If there are no readers or writers active and there is a
4950 * writer waiting, we need to wake it up.
4952 if (l->count == 0 && l->waiters > 0)
4953 cv_signal(&l->cv);
4954 mutex_exit(&l->lock);
4958 nfs_rw_lock_held(nfs_rwlock_t *l, krw_t rw)
4961 if (rw == RW_READER)
4962 return (l->count > 0);
4963 ASSERT(rw == RW_WRITER);
4964 return (l->count < 0);
4967 /* ARGSUSED */
4968 void
4969 nfs_rw_init(nfs_rwlock_t *l, char *name, krw_type_t type, void *arg)
4972 l->count = 0;
4973 l->waiters = 0;
4974 l->owner = NULL;
4975 mutex_init(&l->lock, NULL, MUTEX_DEFAULT, NULL);
4976 cv_init(&l->cv, NULL, CV_DEFAULT, NULL);
4977 cv_init(&l->cv_rd, NULL, CV_DEFAULT, NULL);
4980 void
4981 nfs_rw_destroy(nfs_rwlock_t *l)
4984 mutex_destroy(&l->lock);
4985 cv_destroy(&l->cv);
4986 cv_destroy(&l->cv_rd);
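/*
 * Editor's illustrative sketch (not part of the original source):
 * typical lifecycle of an nfs_rwlock_t using the routines above.  The
 * encoding the sketch relies on follows the code: count > 0 is the
 * number of active readers, count < 0 means a writer holds the lock
 * (more negative for nested writer enters), and owner records the
 * writing thread.
 */
static int
example_rwlock_usage(void)
{
	nfs_rwlock_t lock;
	int error;

	nfs_rw_init(&lock, NULL, RW_DEFAULT, NULL);

	/* Interruptible read enter; returns EINTR if a signal arrives. */
	error = nfs_rw_enter_sig(&lock, RW_READER, 1);
	if (error == 0)
		nfs_rw_exit(&lock);

	/* Non-blocking write attempt; non-zero return means we hold it. */
	if (nfs_rw_tryenter(&lock, RW_WRITER)) {
		ASSERT(nfs_rw_lock_held(&lock, RW_WRITER));
		nfs_rw_exit(&lock);
	}

	nfs_rw_destroy(&lock);
	return (error);
}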
4990 nfs3_rddir_compar(const void *x, const void *y)
4992 rddir_cache *a = (rddir_cache *)x;
4993 rddir_cache *b = (rddir_cache *)y;
4995 if (a->nfs3_cookie == b->nfs3_cookie) {
4996 if (a->buflen == b->buflen)
4997 return (0);
4998 if (a->buflen < b->buflen)
4999 return (-1);
5000 return (1);
5003 if (a->nfs3_cookie < b->nfs3_cookie)
5004 return (-1);
5006 return (1);
5010 nfs_rddir_compar(const void *x, const void *y)
5012 rddir_cache *a = (rddir_cache *)x;
5013 rddir_cache *b = (rddir_cache *)y;
5015 if (a->nfs_cookie == b->nfs_cookie) {
5016 if (a->buflen == b->buflen)
5017 return (0);
5018 if (a->buflen < b->buflen)
5019 return (-1);
5020 return (1);
5023 if (a->nfs_cookie < b->nfs_cookie)
5024 return (-1);
5026 return (1);
5029 static char *
5030 nfs_getsrvnames(mntinfo_t *mi, size_t *len)
5032 servinfo_t *s;
5033 char *srvnames;
5034 char *namep;
5035 size_t length;
5038 * Calculate the length of the string required to hold all
5039 * of the server names plus either a comma or a null
5040 * character following each individual one.
5042 length = 0;
5043 for (s = mi->mi_servers; s != NULL; s = s->sv_next)
5044 length += s->sv_hostnamelen;
5046 srvnames = kmem_alloc(length, KM_SLEEP);
5048 namep = srvnames;
5049 for (s = mi->mi_servers; s != NULL; s = s->sv_next) {
5050 (void) strcpy(namep, s->sv_hostname);
5051 namep += s->sv_hostnamelen - 1;
5052 *namep++ = ',';
5054 *--namep = '\0';
5056 *len = length;
5058 return (srvnames);
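/*
 * Editor's note (worked example, not part of the original source):
 * sv_hostnamelen includes the terminating NUL, so with two servers
 * "alpha" (sv_hostnamelen 6) and "beta" (sv_hostnamelen 5) the loop
 * above allocates 6 + 5 = 11 bytes.  Each name is copied and followed
 * by a comma; the comma written after the last name is then backed
 * over and replaced with '\0', leaving "alpha,beta" in exactly 11
 * bytes.
 */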
5062 * These two functions are temporary and designed for the upgrade-workaround
5063 * only. They cannot be used for general zone-crossing NFS client support, and
5064 * will be removed shortly.
5066 * When the workaround is enabled, all NFS traffic is forced into the global
5067 * zone. These functions are called when the code needs to refer to the state
5068 * of the underlying network connection. They're not called when the function
5069 * needs to refer to the state of the process that invoked the system call.
5070 * (E.g., when checking whether the zone is shutting down during the mount()
5071 * call.)
5074 struct zone *
5075 nfs_zone(void)
5077 return (nfs_global_client_only != 0 ? global_zone : curproc->p_zone);
5080 zoneid_t
5081 nfs_zoneid(void)
5083 return (nfs_global_client_only != 0 ? GLOBAL_ZONEID : getzoneid());
5086 boolean_t
5087 nfs_has_ctty(void)
5089 boolean_t rv;
5090 mutex_enter(&curproc->p_splock);
5091 rv = (curproc->p_sessp->s_vp != NULL);
5092 mutex_exit(&curproc->p_splock);
5093 return (rv);
5097 * Scan the xattr directory to see if it has any generic user attributes.
5100 do_xattr_exists_check(vnode_t *vp, ulong_t *valp, cred_t *cr)
5102 struct uio uio;
5103 struct iovec iov;
5104 char *dbuf;
5105 struct dirent64 *dp;
5106 size_t dlen = 8 * 1024;
5107 size_t dbuflen;
5108 int eof = 0;
5109 int error;
5111 *valp = 0;
5112 dbuf = kmem_alloc(dlen, KM_SLEEP);
5113 uio.uio_iov = &iov;
5114 uio.uio_iovcnt = 1;
5115 uio.uio_segflg = UIO_SYSSPACE;
5116 uio.uio_fmode = 0;
5117 uio.uio_extflg = UIO_COPY_CACHED;
5118 uio.uio_loffset = 0;
5119 uio.uio_resid = dlen;
5120 iov.iov_base = dbuf;
5121 iov.iov_len = dlen;
5122 (void) fop_rwlock(vp, V_WRITELOCK_FALSE, NULL);
5123 error = fop_readdir(vp, &uio, cr, &eof, NULL, 0);
5124 fop_rwunlock(vp, V_WRITELOCK_FALSE, NULL);
5126 dbuflen = dlen - uio.uio_resid;
5128 if (error || dbuflen == 0) {
5129 kmem_free(dbuf, dlen);
5130 return (error);
5133 dp = (dirent64_t *)dbuf;
5135 while ((intptr_t)dp < (intptr_t)dbuf + dbuflen) {
5136 if (strcmp(dp->d_name, ".") == 0 ||
5137 strcmp(dp->d_name, "..") == 0 || strcmp(dp->d_name,
5138 VIEW_READWRITE) == 0 || strcmp(dp->d_name,
5139 VIEW_READONLY) == 0) {
5140 dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen);
5141 continue;
5144 *valp = 1;
5145 break;
5147 kmem_free(dbuf, dlen);
5148 return (0);
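/*
 * Editor's illustrative sketch (not part of the original source): a
 * hypothetical caller of do_xattr_exists_check().  The vnode passed in
 * is assumed to be the already-looked-up extended attribute directory;
 * *valp comes back as 1 only if an entry other than ".", "..", or the
 * system views (VIEW_READWRITE/VIEW_READONLY) was found.
 */
static int
example_has_user_xattrs(vnode_t *xattr_dvp, cred_t *cr, boolean_t *resultp)
{
	ulong_t val;
	int error;

	error = do_xattr_exists_check(xattr_dvp, &val, cr);
	if (error == 0)
		*resultp = (val != 0) ? B_TRUE : B_FALSE;

	return (error);
}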