Merge commit 'ea01a15a654b9e1c7b37d958f4d1911882ed7781'
[unleashed.git] / kernel / fs / nfs / nfs4_rnode.c
blobf2375bab27811174edcfc0e0c905051a48cde49e
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
27 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
28 * All Rights Reserved
32 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
33 * Copyright (c) 2017 by Delphix. All rights reserved.
36 #include <sys/param.h>
37 #include <sys/types.h>
38 #include <sys/systm.h>
39 #include <sys/cred.h>
40 #include <sys/proc.h>
41 #include <sys/user.h>
42 #include <sys/time.h>
43 #include <sys/buf.h>
44 #include <sys/vfs.h>
45 #include <sys/vnode.h>
46 #include <sys/socket.h>
47 #include <sys/uio.h>
48 #include <sys/tiuser.h>
49 #include <sys/swap.h>
50 #include <sys/errno.h>
51 #include <sys/debug.h>
52 #include <sys/kmem.h>
53 #include <sys/kstat.h>
54 #include <sys/cmn_err.h>
55 #include <sys/vtrace.h>
56 #include <sys/session.h>
57 #include <sys/dnlc.h>
58 #include <sys/bitmap.h>
59 #include <sys/acl.h>
60 #include <sys/ddi.h>
61 #include <sys/pathname.h>
62 #include <sys/flock.h>
63 #include <sys/dirent.h>
64 #include <sys/flock.h>
65 #include <sys/callb.h>
66 #include <sys/sdt.h>
68 #include <vm/pvn.h>
70 #include <rpc/types.h>
71 #include <rpc/xdr.h>
72 #include <rpc/auth.h>
73 #include <rpc/rpcsec_gss.h>
74 #include <rpc/clnt.h>
76 #include <nfs/nfs.h>
77 #include <nfs/nfs_clnt.h>
78 #include <nfs/nfs_acl.h>
80 #include <nfs/nfs4.h>
81 #include <nfs/rnode4.h>
82 #include <nfs/nfs4_clnt.h>
85 * The hash queues for the access to active and cached rnodes
86 * are organized as doubly linked lists. A reader/writer lock
87 * for each hash bucket is used to control access and to synchronize
88 * lookups, additions, and deletions from the hash queue.
90 * The rnode freelist is organized as a doubly linked list with
91 * a head pointer. Additions and deletions are synchronized via
92 * a single mutex.
94 * In order to add an rnode to the free list, it must be hashed into
95 * a hash queue and the exclusive lock to the hash queue be held.
96 * If an rnode is not hashed into a hash queue, then it is destroyed
97 * because it represents no valuable information that can be reused
98 * about the file. The exclusive lock to the hash queue must be
99 * held in order to prevent a lookup in the hash queue from finding
100 * the rnode and using it and assuming that the rnode is not on the
101 * freelist. The lookup in the hash queue will have the hash queue
102 * locked, either exclusive or shared.
104 * The vnode reference count for each rnode is not allowed to drop
105 * below 1. This prevents external entities, such as the VM
106 * subsystem, from acquiring references to vnodes already on the
107 * freelist and then trying to place them back on the freelist
108 * when their reference is released. This means that when an
109 * rnode is looked up in the hash queues, then either the rnode
110 * is removed from the freelist and that reference is transferred to
111 * the new reference or the vnode reference count must be incremented
112 * accordingly. The mutex for the freelist must be held in order to
113 * accurately test to see if the rnode is on the freelist or not.
114 * The hash queue lock might be held shared and it is possible that
115 * two different threads may race to remove the rnode from the
116 * freelist. This race can be resolved by holding the mutex for the
117 * freelist. Please note that the mutex for the freelist does not
118 * need to be held if the rnode is not on the freelist. It can not be
119 * placed on the freelist due to the requirement that the thread
120 * putting the rnode on the freelist must hold the exclusive lock
121 * to the hash queue and the thread doing the lookup in the hash
122 * queue is holding either a shared or exclusive lock to the hash
123 * queue.
125 * The lock ordering is:
127 * hash bucket lock -> vnode lock
128 * hash bucket lock -> freelist lock -> r_statelock
130 r4hashq_t *rtable4;
132 static kmutex_t rp4freelist_lock;
133 static rnode4_t *rp4freelist = NULL;
134 static long rnode4_new = 0;
135 int rtable4size;
136 static int rtable4mask;
137 static struct kmem_cache *rnode4_cache;
138 static int rnode4_hashlen = 4;
140 static void r4inactive(rnode4_t *, cred_t *);
141 static vnode_t *make_rnode4(nfs4_sharedfh_t *, r4hashq_t *, struct vfs *,
142 const struct vnodeops *,
143 int (*)(vnode_t *, page_t *, uoff_t *, size_t *, int,
144 cred_t *),
145 int *, cred_t *);
146 static void rp4_rmfree(rnode4_t *);
147 int nfs4_free_data_reclaim(rnode4_t *);
148 static int nfs4_active_data_reclaim(rnode4_t *);
149 static int nfs4_free_reclaim(void);
150 static int nfs4_active_reclaim(void);
151 static int nfs4_rnode_reclaim(void);
152 static void nfs4_reclaim(void *);
153 static int isrootfh(nfs4_sharedfh_t *, rnode4_t *);
154 static void uninit_rnode4(rnode4_t *);
155 static void destroy_rnode4(rnode4_t *);
156 static void r4_stub_set(rnode4_t *, nfs4_stub_type_t);
#ifdef DEBUG
/* Flag to enable dup rnode detection. */
static int r4_check_for_dups = 0;

static int nfs4_rnode_debug = 0;

/* if nonzero, kmem_cache_free() rnodes rather than place on freelist */
static int nfs4_rnode_nofreelist = 0;

/* give messages on colliding shared filehandles */
static void r4_dup_check(rnode4_t *rp, vfs_t *vfsp);
#endif

168 * If the vnode has pages, run the list and check for any that are
169 * still dangling. We call this routine before putting an rnode on
170 * the free list.
172 static int
173 nfs4_dross_pages(vnode_t *vp)
175 page_t *pp;
177 vmobject_lock(&vp->v_object);
178 for (pp = vmobject_get_head(&vp->v_object);
179 pp != NULL;
180 pp = vmobject_get_next(&vp->v_object, pp)) {
181 if (PP_ISPVN_TAG(pp) &&
182 pp->p_fsdata != C_NOCOMMIT) {
183 vmobject_unlock(&vp->v_object);
184 return (1);
187 vmobject_unlock(&vp->v_object);
189 return (0);
193 * Flush any pages left on this rnode.
195 static void
196 r4flushpages(rnode4_t *rp, cred_t *cr)
198 vnode_t *vp;
199 int error;
202 * Before freeing anything, wait until all asynchronous
203 * activity is done on this rnode. This will allow all
204 * asynchronous read ahead and write behind i/o's to
205 * finish.
207 mutex_enter(&rp->r_statelock);
208 while (rp->r_count > 0)
209 cv_wait(&rp->r_cv, &rp->r_statelock);
210 mutex_exit(&rp->r_statelock);
213 * Flush and invalidate all pages associated with the vnode.
215 vp = RTOV4(rp);
216 if (nfs4_has_pages(vp)) {
217 ASSERT(vp->v_type != VCHR);
218 if ((rp->r_flags & R4DIRTY) && !rp->r_error) {
219 error = fop_putpage(vp, 0, 0, 0, cr, NULL);
220 if (error && (error == ENOSPC || error == EDQUOT)) {
221 mutex_enter(&rp->r_statelock);
222 if (!rp->r_error)
223 rp->r_error = error;
224 mutex_exit(&rp->r_statelock);
227 nfs4_invalidate_pages(vp, 0, cr);
232 * Free the resources associated with an rnode.
234 static void
235 r4inactive(rnode4_t *rp, cred_t *cr)
237 vnode_t *vp;
238 char *contents;
239 int size;
240 vsecattr_t *vsp;
241 vnode_t *xattr;
243 r4flushpages(rp, cr);
245 vp = RTOV4(rp);
248 * Free any held caches which may be
249 * associated with this rnode.
251 mutex_enter(&rp->r_statelock);
252 contents = rp->r_symlink.contents;
253 size = rp->r_symlink.size;
254 rp->r_symlink.contents = NULL;
255 vsp = rp->r_secattr;
256 rp->r_secattr = NULL;
257 xattr = rp->r_xattr_dir;
258 rp->r_xattr_dir = NULL;
259 mutex_exit(&rp->r_statelock);
262 * Free the access cache entries.
264 (void) nfs4_access_purge_rp(rp);
267 * Free the readdir cache entries.
269 nfs4_purge_rddir_cache(vp);
272 * Free the symbolic link cache.
274 if (contents != NULL) {
276 kmem_free((void *)contents, size);
280 * Free any cached ACL.
282 if (vsp != NULL)
283 nfs4_acl_free_cache(vsp);
286 * Release the cached xattr_dir
288 if (xattr != NULL)
289 VN_RELE(xattr);
293 * We have seen a case that the fh passed in is for "." which
294 * should be a VROOT node, however, the fh is different from the
295 * root fh stored in the mntinfo4_t. The invalid fh might be
296 * from a misbehaved server and will panic the client system at
297 * a later time. To avoid the panic, we drop the bad fh, use
298 * the root fh from mntinfo4_t, and print an error message
299 * for attention.
301 nfs4_sharedfh_t *
302 badrootfh_check(nfs4_sharedfh_t *fh, nfs4_fname_t *nm, mntinfo4_t *mi,
303 int *wasbad)
305 char *s;
307 *wasbad = 0;
308 s = fn_name(nm);
309 ASSERT(strcmp(s, "..") != 0);
311 if ((s[0] == '.' && s[1] == '\0') && fh &&
312 !SFH4_SAME(mi->mi_rootfh, fh)) {
313 #ifdef DEBUG
314 nfs4_fhandle_t fhandle;
316 zcmn_err(mi->mi_zone->zone_id, CE_WARN,
317 "Server %s returns a different "
318 "root filehandle for the path %s:",
319 mi->mi_curr_serv->sv_hostname,
320 mi->mi_curr_serv->sv_path);
322 /* print the bad fh */
323 fhandle.fh_len = fh->sfh_fh.nfs_fh4_len;
324 bcopy(fh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
325 fhandle.fh_len);
326 nfs4_printfhandle(&fhandle);
328 /* print mi_rootfh */
329 fhandle.fh_len = mi->mi_rootfh->sfh_fh.nfs_fh4_len;
330 bcopy(mi->mi_rootfh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
331 fhandle.fh_len);
332 nfs4_printfhandle(&fhandle);
333 #endif
334 /* use mi_rootfh instead; fh will be rele by the caller */
335 fh = mi->mi_rootfh;
336 *wasbad = 1;
339 kmem_free(s, MAXNAMELEN);
340 return (fh);
343 void
344 r4_do_attrcache(vnode_t *vp, nfs4_ga_res_t *garp, int newnode,
345 hrtime_t t, cred_t *cr, int index)
347 int is_stub;
348 vattr_t *attr;
350 * Don't add to attrcache if time overflow, but
351 * no need to check because either attr is null or the time
352 * values in it were processed by nfs4_time_ntov(), which checks
353 * for time overflows.
355 attr = garp ? &garp->n4g_va : NULL;
357 if (attr) {
358 if (!newnode) {
359 rw_exit(&rtable4[index].r_lock);
360 #ifdef DEBUG
361 if (vp->v_type != attr->va_type &&
362 vp->v_type != VNON && attr->va_type != VNON) {
363 zcmn_err(VTOMI4(vp)->mi_zone->zone_id, CE_WARN,
364 "makenfs4node: type (%d) doesn't "
365 "match type of found node at %p (%d)",
366 attr->va_type, (void *)vp, vp->v_type);
368 #endif
369 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
370 } else {
371 rnode4_t *rp = VTOR4(vp);
373 vp->v_type = attr->va_type;
374 vp->v_rdev = attr->va_rdev;
377 * Turn this object into a "stub" object if we
378 * crossed an underlying server fs boundary.
379 * To make this check, during mount we save the
380 * fsid of the server object being mounted.
381 * Here we compare this object's server fsid
382 * with the fsid we saved at mount. If they
383 * are different, we crossed server fs boundary.
385 * The stub type is set (or not) at rnode
386 * creation time and it never changes for life
387 * of the rnode.
389 * This stub will be for a mirror-mount, rather than
390 * a referral (the latter also sets R4SRVSTUB).
392 * The stub type is also set during RO failover,
393 * nfs4_remap_file().
395 * We don't bother with taking r_state_lock to
396 * set the stub type because this is a new rnode
397 * and we're holding the hash bucket r_lock RW_WRITER.
398 * No other thread could have obtained access
399 * to this rnode.
401 is_stub = 0;
402 if (garp->n4g_fsid_valid) {
403 fattr4_fsid ga_fsid = garp->n4g_fsid;
404 servinfo4_t *svp = rp->r_server;
406 rp->r_srv_fsid = ga_fsid;
408 (void) nfs_rw_enter_sig(&svp->sv_lock,
409 RW_READER, 0);
410 if (!FATTR4_FSID_EQ(&ga_fsid, &svp->sv_fsid))
411 is_stub = 1;
412 nfs_rw_exit(&svp->sv_lock);
415 if (is_stub)
416 r4_stub_mirrormount(rp);
417 else
418 r4_stub_none(rp);
420 /* Can not cache partial attr */
421 if (attr->va_mask == AT_ALL)
422 nfs4_attrcache_noinval(vp, garp, t);
423 else
424 PURGE_ATTRCACHE4(vp);
426 rw_exit(&rtable4[index].r_lock);
428 } else {
429 if (newnode) {
430 PURGE_ATTRCACHE4(vp);
432 rw_exit(&rtable4[index].r_lock);
437 * Find or create an rnode based primarily on filehandle. To be
438 * used when dvp (vnode for parent directory) is not available;
439 * otherwise, makenfs4node() should be used.
441 * The nfs4_fname_t argument *npp is consumed and nulled out.
444 vnode_t *
445 makenfs4node_by_fh(nfs4_sharedfh_t *sfh, nfs4_sharedfh_t *psfh,
446 nfs4_fname_t **npp, nfs4_ga_res_t *garp,
447 mntinfo4_t *mi, cred_t *cr, hrtime_t t)
449 vfs_t *vfsp = mi->mi_vfsp;
450 int newnode = 0;
451 vnode_t *vp;
452 rnode4_t *rp;
453 svnode_t *svp;
454 nfs4_fname_t *name, *svpname;
455 int index;
457 ASSERT(npp && *npp);
458 name = *npp;
459 *npp = NULL;
461 index = rtable4hash(sfh);
462 rw_enter(&rtable4[index].r_lock, RW_READER);
464 vp = make_rnode4(sfh, &rtable4[index], vfsp,
465 &nfs4_vnodeops, nfs4_putapage, &newnode, cr);
467 svp = VTOSV(vp);
468 rp = VTOR4(vp);
469 if (newnode) {
470 svp->sv_forw = svp->sv_back = svp;
471 svp->sv_name = name;
472 if (psfh != NULL)
473 sfh4_hold(psfh);
474 svp->sv_dfh = psfh;
475 } else {
477 * It is possible that due to a server
478 * side rename fnames have changed.
479 * update the fname here.
481 mutex_enter(&rp->r_svlock);
482 svpname = svp->sv_name;
483 if (svp->sv_name != name) {
484 svp->sv_name = name;
485 mutex_exit(&rp->r_svlock);
486 fn_rele(&svpname);
487 } else {
488 mutex_exit(&rp->r_svlock);
489 fn_rele(&name);
493 ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
494 r4_do_attrcache(vp, garp, newnode, t, cr, index);
495 ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);
497 return (vp);
501 * Find or create a vnode for the given filehandle, filesystem, parent, and
502 * name. The reference to nm is consumed, so the caller must first do an
503 * fn_hold() if it wants to continue using nm after this call.
505 vnode_t *
506 makenfs4node(nfs4_sharedfh_t *fh, nfs4_ga_res_t *garp, struct vfs *vfsp,
507 hrtime_t t, cred_t *cr, vnode_t *dvp, nfs4_fname_t *nm)
509 vnode_t *vp;
510 int newnode;
511 int index;
512 mntinfo4_t *mi = VFTOMI4(vfsp);
513 int had_badfh = 0;
514 rnode4_t *rp;
516 ASSERT(dvp != NULL);
518 fh = badrootfh_check(fh, nm, mi, &had_badfh);
520 index = rtable4hash(fh);
521 rw_enter(&rtable4[index].r_lock, RW_READER);
524 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
526 vp = make_rnode4(fh, &rtable4[index], vfsp, &nfs4_vnodeops,
527 nfs4_putapage, &newnode, cr);
529 rp = VTOR4(vp);
530 sv_activate(&vp, dvp, &nm, newnode);
531 if (dvp->v_flag & V_XATTRDIR) {
532 mutex_enter(&rp->r_statelock);
533 rp->r_flags |= R4ISXATTR;
534 mutex_exit(&rp->r_statelock);
537 /* if getting a bad file handle, do not cache the attributes. */
538 if (had_badfh) {
539 rw_exit(&rtable4[index].r_lock);
540 return (vp);
543 ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
544 r4_do_attrcache(vp, garp, newnode, t, cr, index);
545 ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);
547 return (vp);
551 * Hash on address of filehandle object.
552 * XXX totally untuned.
556 rtable4hash(nfs4_sharedfh_t *fh)
558 return (((uintptr_t)fh / sizeof (*fh)) & rtable4mask);
562 * Find or create the vnode for the given filehandle and filesystem.
563 * *newnode is set to zero if the vnode already existed; non-zero if it had
564 * to be created.
566 * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
569 static vnode_t *
570 make_rnode4(nfs4_sharedfh_t *fh, r4hashq_t *rhtp, struct vfs *vfsp,
571 const struct vnodeops *vops,
572 int (*putapage)(vnode_t *, page_t *, uoff_t *, size_t *, int, cred_t *),
573 int *newnode, cred_t *cr)
575 rnode4_t *rp;
576 rnode4_t *trp;
577 vnode_t *vp;
578 mntinfo4_t *mi;
580 ASSERT(RW_READ_HELD(&rhtp->r_lock));
582 mi = VFTOMI4(vfsp);
584 start:
585 if ((rp = r4find(rhtp, fh, vfsp)) != NULL) {
586 vp = RTOV4(rp);
587 *newnode = 0;
588 return (vp);
590 rw_exit(&rhtp->r_lock);
592 mutex_enter(&rp4freelist_lock);
594 if (rp4freelist != NULL && rnode4_new >= nrnode) {
595 rp = rp4freelist;
596 rp4_rmfree(rp);
597 mutex_exit(&rp4freelist_lock);
599 vp = RTOV4(rp);
601 if (rp->r_flags & R4HASHED) {
602 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
603 mutex_enter(&vp->v_lock);
604 if (vp->v_count > 1) {
605 VN_RELE_LOCKED(vp);
606 mutex_exit(&vp->v_lock);
607 rw_exit(&rp->r_hashq->r_lock);
608 rw_enter(&rhtp->r_lock, RW_READER);
609 goto start;
611 mutex_exit(&vp->v_lock);
612 rp4_rmhash_locked(rp);
613 rw_exit(&rp->r_hashq->r_lock);
616 r4inactive(rp, cr);
618 mutex_enter(&vp->v_lock);
619 if (vp->v_count > 1) {
620 VN_RELE_LOCKED(vp);
621 mutex_exit(&vp->v_lock);
622 rw_enter(&rhtp->r_lock, RW_READER);
623 goto start;
625 mutex_exit(&vp->v_lock);
626 vn_invalid(vp);
629 * destroy old locks before bzero'ing and
630 * recreating the locks below.
632 uninit_rnode4(rp);
635 * Make sure that if rnode is recycled then
636 * VFS count is decremented properly before
637 * reuse.
639 VFS_RELE(vp->v_vfsp);
640 vn_reinit(vp);
641 } else {
642 vnode_t *new_vp;
644 mutex_exit(&rp4freelist_lock);
646 rp = kmem_cache_alloc(rnode4_cache, KM_SLEEP);
647 new_vp = vn_alloc(KM_SLEEP);
649 atomic_inc_ulong((ulong_t *)&rnode4_new);
650 #ifdef DEBUG
651 clstat4_debug.nrnode.value.ui64++;
652 #endif
653 vp = new_vp;
656 bzero(rp, sizeof (*rp));
657 rp->r_vnode = vp;
658 nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
659 nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
660 mutex_init(&rp->r_svlock, NULL, MUTEX_DEFAULT, NULL);
661 mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
662 mutex_init(&rp->r_statev4_lock, NULL, MUTEX_DEFAULT, NULL);
663 mutex_init(&rp->r_os_lock, NULL, MUTEX_DEFAULT, NULL);
664 rp->created_v4 = 0;
665 list_create(&rp->r_open_streams, sizeof (nfs4_open_stream_t),
666 offsetof(nfs4_open_stream_t, os_node));
667 rp->r_lo_head.lo_prev_rnode = &rp->r_lo_head;
668 rp->r_lo_head.lo_next_rnode = &rp->r_lo_head;
669 cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
670 cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
671 rp->r_flags = R4READDIRWATTR;
672 rp->r_fh = fh;
673 rp->r_hashq = rhtp;
674 sfh4_hold(rp->r_fh);
675 rp->r_server = mi->mi_curr_serv;
676 rp->r_deleg_type = OPEN_DELEGATE_NONE;
677 rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
678 nfs_rw_init(&rp->r_deleg_recall_lock, NULL, RW_DEFAULT, NULL);
680 rddir4_cache_create(rp);
681 rp->r_putapage = putapage;
682 vn_setops(vp, vops);
683 vp->v_data = (caddr_t)rp;
684 vp->v_vfsp = vfsp;
685 VFS_HOLD(vfsp);
686 vp->v_type = VNON;
687 vp->v_flag |= VMODSORT;
688 if (isrootfh(fh, rp))
689 vp->v_flag = VROOT;
690 vn_exists(vp);
693 * There is a race condition if someone else
694 * alloc's the rnode while no locks are held, so we
695 * check again and recover if found.
697 rw_enter(&rhtp->r_lock, RW_WRITER);
698 if ((trp = r4find(rhtp, fh, vfsp)) != NULL) {
699 vp = RTOV4(trp);
700 *newnode = 0;
701 rw_exit(&rhtp->r_lock);
702 rp4_addfree(rp, cr);
703 rw_enter(&rhtp->r_lock, RW_READER);
704 return (vp);
706 rp4_addhash(rp);
707 *newnode = 1;
708 return (vp);
711 static void
712 uninit_rnode4(rnode4_t *rp)
714 vnode_t *vp = RTOV4(rp);
716 ASSERT(rp != NULL);
717 ASSERT(vp != NULL);
718 ASSERT(vp->v_count == 1);
719 ASSERT(rp->r_count == 0);
720 ASSERT(rp->r_mapcnt == 0);
721 if (rp->r_flags & R4LODANGLERS) {
722 nfs4_flush_lock_owners(rp);
724 ASSERT(rp->r_lo_head.lo_next_rnode == &rp->r_lo_head);
725 ASSERT(rp->r_lo_head.lo_prev_rnode == &rp->r_lo_head);
726 ASSERT(!(rp->r_flags & R4HASHED));
727 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
728 nfs4_clear_open_streams(rp);
729 list_destroy(&rp->r_open_streams);
732 * Destroy the rddir cache first since we need to grab the r_statelock.
734 mutex_enter(&rp->r_statelock);
735 rddir4_cache_destroy(rp);
736 mutex_exit(&rp->r_statelock);
737 sv_uninit(&rp->r_svnode);
738 sfh4_rele(&rp->r_fh);
739 nfs_rw_destroy(&rp->r_rwlock);
740 nfs_rw_destroy(&rp->r_lkserlock);
741 mutex_destroy(&rp->r_statelock);
742 mutex_destroy(&rp->r_statev4_lock);
743 mutex_destroy(&rp->r_os_lock);
744 cv_destroy(&rp->r_cv);
745 cv_destroy(&rp->r_commit.c_cv);
746 nfs_rw_destroy(&rp->r_deleg_recall_lock);
747 if (rp->r_flags & R4DELMAPLIST)
748 list_destroy(&rp->r_indelmap);
752 * Put an rnode on the free list.
754 * Rnodes which were allocated above and beyond the normal limit
755 * are immediately freed.
757 void
758 rp4_addfree(rnode4_t *rp, cred_t *cr)
760 vnode_t *vp;
761 vnode_t *xattr;
762 struct vfs *vfsp;
764 vp = RTOV4(rp);
765 ASSERT(vp->v_count >= 1);
766 ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
769 * If we have too many rnodes allocated and there are no
770 * references to this rnode, or if the rnode is no longer
771 * accessible by it does not reside in the hash queues,
772 * or if an i/o error occurred while writing to the file,
773 * then just free it instead of putting it on the rnode
774 * freelist.
776 vfsp = vp->v_vfsp;
777 if (((rnode4_new > nrnode || !(rp->r_flags & R4HASHED) ||
778 #ifdef DEBUG
779 (nfs4_rnode_nofreelist != 0) ||
780 #endif
781 rp->r_error || (rp->r_flags & R4RECOVERR) ||
782 (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
783 if (rp->r_flags & R4HASHED) {
784 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
785 mutex_enter(&vp->v_lock);
786 if (vp->v_count > 1) {
787 VN_RELE_LOCKED(vp);
788 mutex_exit(&vp->v_lock);
789 rw_exit(&rp->r_hashq->r_lock);
790 return;
792 mutex_exit(&vp->v_lock);
793 rp4_rmhash_locked(rp);
794 rw_exit(&rp->r_hashq->r_lock);
798 * Make sure we don't have a delegation on this rnode
799 * before destroying it.
801 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
802 (void) nfs4delegreturn(rp,
803 NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
806 r4inactive(rp, cr);
809 * Recheck the vnode reference count. We need to
810 * make sure that another reference has not been
811 * acquired while we were not holding v_lock. The
812 * rnode is not in the rnode hash queues; one
813 * way for a reference to have been acquired
814 * is for a fop_putpage because the rnode was marked
815 * with R4DIRTY or for a modified page. This
816 * reference may have been acquired before our call
817 * to r4inactive. The i/o may have been completed,
818 * thus allowing r4inactive to complete, but the
819 * reference to the vnode may not have been released
820 * yet. In any case, the rnode can not be destroyed
821 * until the other references to this vnode have been
822 * released. The other references will take care of
823 * either destroying the rnode or placing it on the
824 * rnode freelist. If there are no other references,
825 * then the rnode may be safely destroyed.
827 mutex_enter(&vp->v_lock);
828 if (vp->v_count > 1) {
829 VN_RELE_LOCKED(vp);
830 mutex_exit(&vp->v_lock);
831 return;
833 mutex_exit(&vp->v_lock);
835 destroy_rnode4(rp);
836 return;
840 * Lock the hash queue and then recheck the reference count
841 * to ensure that no other threads have acquired a reference
842 * to indicate that the rnode should not be placed on the
843 * freelist. If another reference has been acquired, then
844 * just release this one and let the other thread complete
845 * the processing of adding this rnode to the freelist.
847 again:
848 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
850 mutex_enter(&vp->v_lock);
851 if (vp->v_count > 1) {
852 VN_RELE_LOCKED(vp);
853 mutex_exit(&vp->v_lock);
854 rw_exit(&rp->r_hashq->r_lock);
855 return;
857 mutex_exit(&vp->v_lock);
860 * Make sure we don't put an rnode with a delegation
861 * on the free list.
863 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
864 rw_exit(&rp->r_hashq->r_lock);
865 (void) nfs4delegreturn(rp,
866 NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
867 goto again;
871 * Now that we have the hash queue lock, and we know there
872 * are not anymore references on the vnode, check to make
873 * sure there aren't any open streams still on the rnode.
874 * If so, drop the hash queue lock, remove the open streams,
875 * and recheck the v_count.
877 mutex_enter(&rp->r_os_lock);
878 if (list_head(&rp->r_open_streams) != NULL) {
879 mutex_exit(&rp->r_os_lock);
880 rw_exit(&rp->r_hashq->r_lock);
881 if (nfs_zone() != VTOMI4(vp)->mi_zone)
882 nfs4_clear_open_streams(rp);
883 else
884 (void) nfs4close_all(vp, cr);
885 goto again;
887 mutex_exit(&rp->r_os_lock);
890 * Before we put it on the freelist, make sure there are no pages.
891 * If there are, flush and commit of all of the dirty and
892 * uncommitted pages, assuming the file system isn't read only.
894 if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY) && nfs4_dross_pages(vp)) {
895 rw_exit(&rp->r_hashq->r_lock);
896 r4flushpages(rp, cr);
897 goto again;
901 * Before we put it on the freelist, make sure there is no
902 * active xattr directory cached, the freelist will not
903 * have its entries r4inactive'd if there is still an active
904 * rnode, thus nothing in the freelist can hold another
905 * rnode active.
907 xattr = rp->r_xattr_dir;
908 rp->r_xattr_dir = NULL;
911 * If there is no cached data or metadata for this file, then
912 * put the rnode on the front of the freelist so that it will
913 * be reused before other rnodes which may have cached data or
914 * metadata associated with them.
916 mutex_enter(&rp4freelist_lock);
917 if (rp4freelist == NULL) {
918 rp->r_freef = rp;
919 rp->r_freeb = rp;
920 rp4freelist = rp;
921 } else {
922 rp->r_freef = rp4freelist;
923 rp->r_freeb = rp4freelist->r_freeb;
924 rp4freelist->r_freeb->r_freef = rp;
925 rp4freelist->r_freeb = rp;
926 if (!nfs4_has_pages(vp) && rp->r_dir == NULL &&
927 rp->r_symlink.contents == NULL && rp->r_secattr == NULL)
928 rp4freelist = rp;
930 mutex_exit(&rp4freelist_lock);
932 rw_exit(&rp->r_hashq->r_lock);
934 if (xattr)
935 VN_RELE(xattr);
939 * Remove an rnode from the free list.
941 * The caller must be holding rp4freelist_lock and the rnode
942 * must be on the freelist.
944 static void
945 rp4_rmfree(rnode4_t *rp)
948 ASSERT(MUTEX_HELD(&rp4freelist_lock));
949 ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
951 if (rp == rp4freelist) {
952 rp4freelist = rp->r_freef;
953 if (rp == rp4freelist)
954 rp4freelist = NULL;
956 rp->r_freeb->r_freef = rp->r_freef;
957 rp->r_freef->r_freeb = rp->r_freeb;
959 rp->r_freef = rp->r_freeb = NULL;
963 * Put a rnode in the hash table.
965 * The caller must be holding the exclusive hash queue lock
967 void
968 rp4_addhash(rnode4_t *rp)
970 mntinfo4_t *mi;
972 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
973 ASSERT(!(rp->r_flags & R4HASHED));
975 #ifdef DEBUG
976 r4_dup_check(rp, RTOV4(rp)->v_vfsp);
977 #endif
979 rp->r_hashf = rp->r_hashq->r_hashf;
980 rp->r_hashq->r_hashf = rp;
981 rp->r_hashb = (rnode4_t *)rp->r_hashq;
982 rp->r_hashf->r_hashb = rp;
984 mutex_enter(&rp->r_statelock);
985 rp->r_flags |= R4HASHED;
986 mutex_exit(&rp->r_statelock);
988 mi = VTOMI4(RTOV4(rp));
989 mutex_enter(&mi->mi_rnodes_lock);
990 list_insert_tail(&mi->mi_rnodes, rp);
991 mutex_exit(&mi->mi_rnodes_lock);
995 * Remove a rnode from the hash table.
997 * The caller must be holding the hash queue lock.
999 void
1000 rp4_rmhash_locked(rnode4_t *rp)
1002 mntinfo4_t *mi;
1004 ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
1005 ASSERT(rp->r_flags & R4HASHED);
1007 rp->r_hashb->r_hashf = rp->r_hashf;
1008 rp->r_hashf->r_hashb = rp->r_hashb;
1010 mutex_enter(&rp->r_statelock);
1011 rp->r_flags &= ~R4HASHED;
1012 mutex_exit(&rp->r_statelock);
1014 mi = VTOMI4(RTOV4(rp));
1015 mutex_enter(&mi->mi_rnodes_lock);
1016 if (list_link_active(&rp->r_mi_link))
1017 list_remove(&mi->mi_rnodes, rp);
1018 mutex_exit(&mi->mi_rnodes_lock);
1022 * Remove a rnode from the hash table.
1024 * The caller must not be holding the hash queue lock.
1026 void
1027 rp4_rmhash(rnode4_t *rp)
1029 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
1030 rp4_rmhash_locked(rp);
1031 rw_exit(&rp->r_hashq->r_lock);
1035 * Lookup a rnode by fhandle. Ignores rnodes that had failed recovery.
1036 * Returns NULL if no match. If an rnode is returned, the reference count
1037 * on the master vnode is incremented.
1039 * The caller must be holding the hash queue lock, either shared or exclusive.
1041 rnode4_t *
1042 r4find(r4hashq_t *rhtp, nfs4_sharedfh_t *fh, struct vfs *vfsp)
1044 rnode4_t *rp;
1045 vnode_t *vp;
1047 ASSERT(RW_LOCK_HELD(&rhtp->r_lock));
1049 for (rp = rhtp->r_hashf; rp != (rnode4_t *)rhtp; rp = rp->r_hashf) {
1050 vp = RTOV4(rp);
1051 if (vp->v_vfsp == vfsp && SFH4_SAME(rp->r_fh, fh)) {
1053 mutex_enter(&rp->r_statelock);
1054 if (rp->r_flags & R4RECOVERR) {
1055 mutex_exit(&rp->r_statelock);
1056 continue;
1058 mutex_exit(&rp->r_statelock);
1059 #ifdef DEBUG
1060 r4_dup_check(rp, vfsp);
1061 #endif
1062 if (rp->r_freef != NULL) {
1063 mutex_enter(&rp4freelist_lock);
1065 * If the rnode is on the freelist,
1066 * then remove it and use that reference
1067 * as the new reference. Otherwise,
1068 * need to increment the reference count.
1070 if (rp->r_freef != NULL) {
1071 rp4_rmfree(rp);
1072 mutex_exit(&rp4freelist_lock);
1073 } else {
1074 mutex_exit(&rp4freelist_lock);
1075 VN_HOLD(vp);
1077 } else
1078 VN_HOLD(vp);
1081 * if root vnode, set v_flag to indicate that
1083 if (isrootfh(fh, rp)) {
1084 if (!(vp->v_flag & VROOT)) {
1085 mutex_enter(&vp->v_lock);
1086 vp->v_flag |= VROOT;
1087 mutex_exit(&vp->v_lock);
1090 return (rp);
1093 return (NULL);
1097 * Lookup an rnode by fhandle. Just a wrapper for r4find()
1098 * that assumes the caller hasn't already got the lock
1099 * on the hash bucket.
1101 rnode4_t *
1102 r4find_unlocked(nfs4_sharedfh_t *fh, struct vfs *vfsp)
1104 rnode4_t *rp;
1105 int index;
1107 index = rtable4hash(fh);
1108 rw_enter(&rtable4[index].r_lock, RW_READER);
1109 rp = r4find(&rtable4[index], fh, vfsp);
1110 rw_exit(&rtable4[index].r_lock);
1112 return (rp);
1116 * Return 1 if there is an active vnode belonging to this vfs in the
1117 * rtable4 cache.
1119 * Several of these checks are done without holding the usual
1120 * locks. This is safe because destroy_rtable4(), rp4_addfree(),
1121 * etc. will redo the necessary checks before actually destroying
1122 * any rnodes.
1125 check_rtable4(struct vfs *vfsp)
1127 rnode4_t *rp;
1128 vnode_t *vp;
1129 mntinfo4_t *mi;
1131 ASSERT(vfsp != NULL);
1132 mi = VFTOMI4(vfsp);
1134 mutex_enter(&mi->mi_rnodes_lock);
1135 for (rp = list_head(&mi->mi_rnodes); rp != NULL;
1136 rp = list_next(&mi->mi_rnodes, rp)) {
1137 vp = RTOV4(rp);
1139 if (rp->r_freef == NULL ||
1140 (nfs4_has_pages(vp) && (rp->r_flags & R4DIRTY)) ||
1141 rp->r_count > 0) {
1142 mutex_exit(&mi->mi_rnodes_lock);
1143 return (1);
1146 mutex_exit(&mi->mi_rnodes_lock);
1148 return (0);
1152 * Destroy inactive vnodes from the hash queues which
1153 * belong to this vfs. All of the vnodes should be inactive.
1154 * It is essential that we destroy all rnodes in case of
1155 * forced unmount as well as in normal unmount case.
1158 void
1159 destroy_rtable4(struct vfs *vfsp, cred_t *cr)
1161 rnode4_t *rp;
1162 mntinfo4_t *mi;
1164 ASSERT(vfsp != NULL);
1166 mi = VFTOMI4(vfsp);
1168 mutex_enter(&rp4freelist_lock);
1169 mutex_enter(&mi->mi_rnodes_lock);
1170 while ((rp = list_remove_head(&mi->mi_rnodes)) != NULL) {
1172 * If the rnode is no longer on the freelist it is not
1173 * ours and it will be handled by some other thread, so
1174 * skip it.
1176 if (rp->r_freef == NULL)
1177 continue;
1178 mutex_exit(&mi->mi_rnodes_lock);
1180 rp4_rmfree(rp);
1181 mutex_exit(&rp4freelist_lock);
1183 rp4_rmhash(rp);
1186 * This call to rp4_addfree will end up destroying the
1187 * rnode, but in a safe way with the appropriate set
1188 * of checks done.
1190 rp4_addfree(rp, cr);
1192 mutex_enter(&rp4freelist_lock);
1193 mutex_enter(&mi->mi_rnodes_lock);
1195 mutex_exit(&mi->mi_rnodes_lock);
1196 mutex_exit(&rp4freelist_lock);
1200 * This routine destroys all the resources of an rnode
1201 * and finally the rnode itself.
1203 static void
1204 destroy_rnode4(rnode4_t *rp)
1206 vnode_t *vp;
1207 vfs_t *vfsp;
1209 ASSERT(rp->r_deleg_type == OPEN_DELEGATE_NONE);
1211 vp = RTOV4(rp);
1212 vfsp = vp->v_vfsp;
1214 uninit_rnode4(rp);
1215 atomic_dec_ulong((ulong_t *)&rnode4_new);
1216 #ifdef DEBUG
1217 clstat4_debug.nrnode.value.ui64--;
1218 #endif
1219 kmem_cache_free(rnode4_cache, rp);
1220 vn_invalid(vp);
1221 vn_free(vp);
1222 VFS_RELE(vfsp);
1226 * Invalidate the attributes on all rnodes forcing the next getattr
1227 * to go over the wire. Used to flush stale uid and gid mappings.
1228 * Maybe done on a per vfsp, or all rnodes (vfsp == NULL)
1230 void
1231 nfs4_rnode_invalidate(struct vfs *vfsp)
1233 int index;
1234 rnode4_t *rp;
1235 vnode_t *vp;
1238 * Walk the hash queues looking for rnodes.
1240 for (index = 0; index < rtable4size; index++) {
1241 rw_enter(&rtable4[index].r_lock, RW_READER);
1242 for (rp = rtable4[index].r_hashf;
1243 rp != (rnode4_t *)(&rtable4[index]);
1244 rp = rp->r_hashf) {
1245 vp = RTOV4(rp);
1246 if (vfsp != NULL && vp->v_vfsp != vfsp)
1247 continue;
1249 if (!mutex_tryenter(&rp->r_statelock))
1250 continue;
1253 * Expire the attributes by resetting the change
1254 * and attr timeout.
1256 rp->r_change = 0;
1257 PURGE_ATTRCACHE4_LOCKED(rp);
1258 mutex_exit(&rp->r_statelock);
1260 rw_exit(&rtable4[index].r_lock);
1265 * Flush all vnodes in this (or every) vfs.
1266 * Used by nfs_sync and by nfs_unmount.
1268 void
1269 r4flush(struct vfs *vfsp, cred_t *cr)
1271 int index;
1272 rnode4_t *rp;
1273 vnode_t *vp, **vplist;
1274 long num, cnt;
1277 * Check to see whether there is anything to do.
1279 num = rnode4_new;
1280 if (num == 0)
1281 return;
1284 * Allocate a slot for all currently active rnodes on the
1285 * supposition that they all may need flushing.
1287 vplist = kmem_alloc(num * sizeof (*vplist), KM_SLEEP);
1288 cnt = 0;
1291 * If the vfs is known we can do fast path by iterating all rnodes that
1292 * belongs to this vfs. This is much faster than the traditional way
1293 * of iterating rtable4 (below) in a case there is a lot of rnodes that
1294 * does not belong to our vfs.
1296 if (vfsp != NULL) {
1297 mntinfo4_t *mi = VFTOMI4(vfsp);
1299 mutex_enter(&mi->mi_rnodes_lock);
1300 for (rp = list_head(&mi->mi_rnodes); rp != NULL;
1301 rp = list_next(&mi->mi_rnodes, rp)) {
1302 vp = RTOV4(rp);
1304 * Don't bother sync'ing a vp if it
1305 * is part of virtual swap device or
1306 * if VFS is read-only
1308 if (IS_SWAPVP(vp) || vn_is_readonly(vp))
1309 continue;
1311 * If the vnode has pages and is marked as either dirty
1312 * or mmap'd, hold and add this vnode to the list of
1313 * vnodes to flush.
1315 ASSERT(vp->v_vfsp == vfsp);
1316 if (nfs4_has_pages(vp) &&
1317 ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) {
1318 VN_HOLD(vp);
1319 vplist[cnt++] = vp;
1320 if (cnt == num) {
1322 * The vplist is full because there is
1323 * too many rnodes. We are done for
1324 * now.
1326 break;
1330 mutex_exit(&mi->mi_rnodes_lock);
1332 goto done;
1335 ASSERT(vfsp == NULL);
1338 * Walk the hash queues looking for rnodes with page
1339 * lists associated with them. Make a list of these
1340 * files.
1342 for (index = 0; index < rtable4size; index++) {
1343 rw_enter(&rtable4[index].r_lock, RW_READER);
1344 for (rp = rtable4[index].r_hashf;
1345 rp != (rnode4_t *)(&rtable4[index]);
1346 rp = rp->r_hashf) {
1347 vp = RTOV4(rp);
1349 * Don't bother sync'ing a vp if it
1350 * is part of virtual swap device or
1351 * if VFS is read-only
1353 if (IS_SWAPVP(vp) || vn_is_readonly(vp))
1354 continue;
1356 * If the vnode has pages and is marked as either dirty
1357 * or mmap'd, hold and add this vnode to the list of
1358 * vnodes to flush.
1360 if (nfs4_has_pages(vp) &&
1361 ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) {
1362 VN_HOLD(vp);
1363 vplist[cnt++] = vp;
1364 if (cnt == num) {
1365 rw_exit(&rtable4[index].r_lock);
1367 * The vplist is full because there is
1368 * too many rnodes. We are done for
1369 * now.
1371 goto done;
1375 rw_exit(&rtable4[index].r_lock);
1378 done:
1381 * Flush and release all of the files on the list.
1383 while (cnt-- > 0) {
1384 vp = vplist[cnt];
1385 (void) fop_putpage(vp, 0, 0, B_ASYNC, cr, NULL);
1386 VN_RELE(vp);
1390 * Free the space allocated to hold the list.
1392 kmem_free(vplist, num * sizeof (*vplist));
1396 nfs4_free_data_reclaim(rnode4_t *rp)
1398 char *contents;
1399 vnode_t *xattr;
1400 int size;
1401 vsecattr_t *vsp;
1402 int freed;
1403 bool_t rdc = FALSE;
1406 * Free any held caches which may
1407 * be associated with this rnode.
1409 mutex_enter(&rp->r_statelock);
1410 if (rp->r_dir != NULL)
1411 rdc = TRUE;
1412 contents = rp->r_symlink.contents;
1413 size = rp->r_symlink.size;
1414 rp->r_symlink.contents = NULL;
1415 vsp = rp->r_secattr;
1416 rp->r_secattr = NULL;
1417 xattr = rp->r_xattr_dir;
1418 rp->r_xattr_dir = NULL;
1419 mutex_exit(&rp->r_statelock);
1422 * Free the access cache entries.
1424 freed = nfs4_access_purge_rp(rp);
1426 if (rdc == FALSE && contents == NULL && vsp == NULL && xattr == NULL)
1427 return (freed);
1430 * Free the readdir cache entries, incompletely if we can't block.
1432 nfs4_purge_rddir_cache(RTOV4(rp));
1435 * Free the symbolic link cache.
1437 if (contents != NULL) {
1439 kmem_free((void *)contents, size);
1443 * Free any cached ACL.
1445 if (vsp != NULL)
1446 nfs4_acl_free_cache(vsp);
1449 * Release the xattr directory vnode
1451 if (xattr != NULL)
1452 VN_RELE(xattr);
1454 return (1);
1457 static int
1458 nfs4_active_data_reclaim(rnode4_t *rp)
1460 char *contents;
1461 vnode_t *xattr = NULL;
1462 int size;
1463 vsecattr_t *vsp;
1464 int freed;
1465 bool_t rdc = FALSE;
1468 * Free any held credentials and caches which
1469 * may be associated with this rnode.
1471 if (!mutex_tryenter(&rp->r_statelock))
1472 return (0);
1473 contents = rp->r_symlink.contents;
1474 size = rp->r_symlink.size;
1475 rp->r_symlink.contents = NULL;
1476 vsp = rp->r_secattr;
1477 rp->r_secattr = NULL;
1478 if (rp->r_dir != NULL)
1479 rdc = TRUE;
1481 * To avoid a deadlock, do not free r_xattr_dir cache if it is hashed
1482 * on the same r_hashq queue. We are not mandated to free all caches.
1483 * VN_RELE(rp->r_xattr_dir) will be done sometime later - e.g. when the
1484 * rnode 'rp' is freed or put on the free list.
1486 * We will retain NFS4_XATTR_DIR_NOTSUPP because:
1487 * - it has no associated rnode4_t (its v_data is NULL),
1488 * - it is preallocated statically and will never go away,
1489 * so we cannot save anything by releasing it.
1491 if (rp->r_xattr_dir && rp->r_xattr_dir != NFS4_XATTR_DIR_NOTSUPP &&
1492 VTOR4(rp->r_xattr_dir)->r_hashq != rp->r_hashq) {
1493 xattr = rp->r_xattr_dir;
1494 rp->r_xattr_dir = NULL;
1496 mutex_exit(&rp->r_statelock);
1499 * Free the access cache entries.
1501 freed = nfs4_access_purge_rp(rp);
1503 if (contents == NULL && vsp == NULL && rdc == FALSE && xattr == NULL)
1504 return (freed);
1507 * Free the symbolic link cache.
1509 if (contents != NULL) {
1511 kmem_free((void *)contents, size);
1515 * Free any cached ACL.
1517 if (vsp != NULL)
1518 nfs4_acl_free_cache(vsp);
1520 nfs4_purge_rddir_cache(RTOV4(rp));
1523 * Release the xattr directory vnode
1525 if (xattr != NULL)
1526 VN_RELE(xattr);
1528 return (1);
1531 static int
1532 nfs4_free_reclaim(void)
1534 int freed;
1535 rnode4_t *rp;
1537 #ifdef DEBUG
1538 clstat4_debug.f_reclaim.value.ui64++;
1539 #endif
1540 freed = 0;
1541 mutex_enter(&rp4freelist_lock);
1542 rp = rp4freelist;
1543 if (rp != NULL) {
1544 do {
1545 if (nfs4_free_data_reclaim(rp))
1546 freed = 1;
1547 } while ((rp = rp->r_freef) != rp4freelist);
1549 mutex_exit(&rp4freelist_lock);
1550 return (freed);
1553 static int
1554 nfs4_active_reclaim(void)
1556 int freed;
1557 int index;
1558 rnode4_t *rp;
1560 #ifdef DEBUG
1561 clstat4_debug.a_reclaim.value.ui64++;
1562 #endif
1563 freed = 0;
1564 for (index = 0; index < rtable4size; index++) {
1565 rw_enter(&rtable4[index].r_lock, RW_READER);
1566 for (rp = rtable4[index].r_hashf;
1567 rp != (rnode4_t *)(&rtable4[index]);
1568 rp = rp->r_hashf) {
1569 if (nfs4_active_data_reclaim(rp))
1570 freed = 1;
1572 rw_exit(&rtable4[index].r_lock);
1574 return (freed);
1577 static int
1578 nfs4_rnode_reclaim(void)
1580 int freed;
1581 rnode4_t *rp;
1582 vnode_t *vp;
1584 #ifdef DEBUG
1585 clstat4_debug.r_reclaim.value.ui64++;
1586 #endif
1587 freed = 0;
1588 mutex_enter(&rp4freelist_lock);
1589 while ((rp = rp4freelist) != NULL) {
1590 rp4_rmfree(rp);
1591 mutex_exit(&rp4freelist_lock);
1592 if (rp->r_flags & R4HASHED) {
1593 vp = RTOV4(rp);
1594 rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
1595 mutex_enter(&vp->v_lock);
1596 if (vp->v_count > 1) {
1597 VN_RELE_LOCKED(vp);
1598 mutex_exit(&vp->v_lock);
1599 rw_exit(&rp->r_hashq->r_lock);
1600 mutex_enter(&rp4freelist_lock);
1601 continue;
1603 mutex_exit(&vp->v_lock);
1604 rp4_rmhash_locked(rp);
1605 rw_exit(&rp->r_hashq->r_lock);
1608 * This call to rp_addfree will end up destroying the
1609 * rnode, but in a safe way with the appropriate set
1610 * of checks done.
1612 rp4_addfree(rp, CRED());
1613 mutex_enter(&rp4freelist_lock);
1615 mutex_exit(&rp4freelist_lock);
1616 return (freed);
/*
 * Reclaim callback registered for rnode4_cache.  Tries progressively more
 * aggressive steps and stops after the first one that reports success:
 * caches of free rnodes, then caches of active rnodes, and finally the
 * free rnodes themselves.
 */
/*ARGSUSED*/
static void
nfs4_reclaim(void *cdrarg)
{
#ifdef DEBUG
	clstat4_debug.reclaim.value.ui64++;
#endif
	if (!nfs4_free_reclaim() && !nfs4_active_reclaim())
		(void) nfs4_rnode_reclaim();
}
1636 * Returns the clientid4 to use for the given mntinfo4. Note that the
1637 * clientid can change if the caller drops mi_recovlock.
1640 clientid4
1641 mi2clientid(mntinfo4_t *mi)
1643 nfs4_server_t *sp;
1644 clientid4 clientid = 0;
1646 /* this locks down sp if it is found */
1647 sp = find_nfs4_server(mi);
1648 if (sp != NULL) {
1649 clientid = sp->clientid;
1650 mutex_exit(&sp->s_lock);
1651 nfs4_server_rele(sp);
1653 return (clientid);
1657 * Return the current lease time for the server associated with the given
1658 * file. Note that the lease time could change immediately after this
1659 * call.
1662 time_t
1663 r2lease_time(rnode4_t *rp)
1665 nfs4_server_t *sp;
1666 time_t lease_time;
1667 mntinfo4_t *mi = VTOMI4(RTOV4(rp));
1669 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
1671 /* this locks down sp if it is found */
1672 sp = find_nfs4_server(VTOMI4(RTOV4(rp)));
1674 if (VTOMI4(RTOV4(rp))->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
1675 if (sp != NULL) {
1676 mutex_exit(&sp->s_lock);
1677 nfs4_server_rele(sp);
1679 nfs_rw_exit(&mi->mi_recovlock);
1680 return (1); /* 1 second */
1683 ASSERT(sp != NULL);
1685 lease_time = sp->s_lease_time;
1687 mutex_exit(&sp->s_lock);
1688 nfs4_server_rele(sp);
1689 nfs_rw_exit(&mi->mi_recovlock);
1691 return (lease_time);
1695 * Return a list with information about all the known open instances for
1696 * a filesystem. The caller must call r4releopenlist() when done with the
1697 * list.
1699 * We are safe at looking at os_valid and os_pending_close across dropping
1700 * the 'os_sync_lock' to count up the number of open streams and then
1701 * allocate memory for the osp list due to:
1702 * -Looking at os_pending_close is safe since this routine is
1703 * only called via recovery, and os_pending_close can only be set via
1704 * a non-recovery operation (which are all blocked when recovery
1705 * is active).
1707 * -Examining os_valid is safe since non-recovery operations, which
1708 * could potentially switch os_valid to 0, are blocked (via
1709 * nfs4_start_fop) and recovery is single-threaded per mntinfo4_t
1710 * (which means we are the only recovery thread potentially acting
1711 * on this open stream).
1714 nfs4_opinst_t *
1715 r4mkopenlist(mntinfo4_t *mi)
1717 nfs4_opinst_t *reopenlist, *rep;
1718 rnode4_t *rp;
1719 vnode_t *vp;
1720 vfs_t *vfsp = mi->mi_vfsp;
1721 int numosp;
1722 nfs4_open_stream_t *osp;
1723 int index;
1724 open_delegation_type4 dtype;
1725 int hold_vnode;
1727 reopenlist = NULL;
1729 for (index = 0; index < rtable4size; index++) {
1730 rw_enter(&rtable4[index].r_lock, RW_READER);
1731 for (rp = rtable4[index].r_hashf;
1732 rp != (rnode4_t *)(&rtable4[index]);
1733 rp = rp->r_hashf) {
1735 vp = RTOV4(rp);
1736 if (vp->v_vfsp != vfsp)
1737 continue;
1738 hold_vnode = 0;
1740 mutex_enter(&rp->r_os_lock);
1742 /* Count the number of valid open_streams of the file */
1743 numosp = 0;
1744 for (osp = list_head(&rp->r_open_streams); osp != NULL;
1745 osp = list_next(&rp->r_open_streams, osp)) {
1746 mutex_enter(&osp->os_sync_lock);
1747 if (osp->os_valid && !osp->os_pending_close)
1748 numosp++;
1749 mutex_exit(&osp->os_sync_lock);
1752 /* Fill in the valid open streams per vp */
1753 if (numosp > 0) {
1754 int j;
1756 hold_vnode = 1;
1759 * Add a new open instance to the list
1761 rep = kmem_zalloc(sizeof (*reopenlist),
1762 KM_SLEEP);
1763 rep->re_next = reopenlist;
1764 reopenlist = rep;
1766 rep->re_vp = vp;
1767 rep->re_osp = kmem_zalloc(
1768 numosp * sizeof (*(rep->re_osp)),
1769 KM_SLEEP);
1770 rep->re_numosp = numosp;
1772 j = 0;
1773 for (osp = list_head(&rp->r_open_streams);
1774 osp != NULL;
1775 osp = list_next(&rp->r_open_streams, osp)) {
1777 mutex_enter(&osp->os_sync_lock);
1778 if (osp->os_valid &&
1779 !osp->os_pending_close) {
1780 osp->os_ref_count++;
1781 rep->re_osp[j] = osp;
1782 j++;
1784 mutex_exit(&osp->os_sync_lock);
1787 * Assuming valid osp(s) stays valid between
1788 * the time obtaining j and numosp.
1790 ASSERT(j == numosp);
1793 mutex_exit(&rp->r_os_lock);
1794 /* do this here to keep v_lock > r_os_lock */
1795 if (hold_vnode)
1796 VN_HOLD(vp);
1797 mutex_enter(&rp->r_statev4_lock);
1798 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
1800 * If this rnode holds a delegation,
1801 * but if there are no valid open streams,
1802 * then just discard the delegation
1803 * without doing delegreturn.
1805 if (numosp > 0)
1806 rp->r_deleg_needs_recovery =
1807 rp->r_deleg_type;
1809 /* Save the delegation type for use outside the lock */
1810 dtype = rp->r_deleg_type;
1811 mutex_exit(&rp->r_statev4_lock);
1814 * If we have a delegation then get rid of it.
1815 * We've set rp->r_deleg_needs_recovery so we have
1816 * enough information to recover.
1818 if (dtype != OPEN_DELEGATE_NONE) {
1819 (void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
1822 rw_exit(&rtable4[index].r_lock);
1824 return (reopenlist);
1828 * Given a filesystem id, check to see if any rnodes
1829 * within this fsid reside in the rnode cache, other
1830 * than one we know about.
1832 * Return 1 if an rnode is found, 0 otherwise
1835 r4find_by_fsid(mntinfo4_t *mi, fattr4_fsid *moved_fsid)
1837 rnode4_t *rp;
1838 vnode_t *vp;
1839 vfs_t *vfsp = mi->mi_vfsp;
1840 fattr4_fsid *fsid;
1841 int index, found = 0;
1843 for (index = 0; index < rtable4size; index++) {
1844 rw_enter(&rtable4[index].r_lock, RW_READER);
1845 for (rp = rtable4[index].r_hashf;
1846 rp != (rnode4_t *)(&rtable4[index]);
1847 rp = rp->r_hashf) {
1849 vp = RTOV4(rp);
1850 if (vp->v_vfsp != vfsp)
1851 continue;
1854 * XXX there might be a case where a
1855 * replicated fs may have the same fsid
1856 * across two different servers. This
1857 * check isn't good enough in that case
1859 fsid = &rp->r_srv_fsid;
1860 if (FATTR4_FSID_EQ(moved_fsid, fsid)) {
1861 found = 1;
1862 break;
1865 rw_exit(&rtable4[index].r_lock);
1867 if (found)
1868 break;
1870 return (found);
1874 * Release the list of open instance references.
1877 void
1878 r4releopenlist(nfs4_opinst_t *reopenp)
1880 nfs4_opinst_t *rep, *next;
1881 int i;
1883 for (rep = reopenp; rep; rep = next) {
1884 next = rep->re_next;
1886 for (i = 0; i < rep->re_numosp; i++)
1887 open_stream_rele(rep->re_osp[i], VTOR4(rep->re_vp));
1889 VN_RELE(rep->re_vp);
1890 kmem_free(rep->re_osp,
1891 rep->re_numosp * sizeof (*(rep->re_osp)));
1893 kmem_free(rep, sizeof (*rep));
1898 nfs4_rnode_init(void)
1900 ulong_t nrnode4_max;
1901 int i;
1904 * Compute the size of the rnode4 hash table
1906 if (nrnode <= 0)
1907 nrnode = ncsize;
1908 nrnode4_max =
1909 (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode4));
1910 if (nrnode > nrnode4_max || (nrnode == 0 && ncsize == 0)) {
1911 zcmn_err(GLOBAL_ZONEID, CE_NOTE,
1912 "!setting nrnode to max value of %ld", nrnode4_max);
1913 nrnode = nrnode4_max;
1915 rtable4size = 1 << highbit(nrnode / rnode4_hashlen);
1916 rtable4mask = rtable4size - 1;
1919 * Allocate and initialize the hash buckets
1921 rtable4 = kmem_alloc(rtable4size * sizeof (*rtable4), KM_SLEEP);
1922 for (i = 0; i < rtable4size; i++) {
1923 rtable4[i].r_hashf = (rnode4_t *)(&rtable4[i]);
1924 rtable4[i].r_hashb = (rnode4_t *)(&rtable4[i]);
1925 rw_init(&rtable4[i].r_lock, NULL, RW_DEFAULT, NULL);
1928 rnode4_cache = kmem_cache_create("rnode4_cache", sizeof (rnode4_t),
1929 0, NULL, NULL, nfs4_reclaim, NULL, NULL, 0);
1931 return (0);
1935 nfs4_rnode_fini(void)
1937 int i;
1940 * Deallocate the rnode hash queues
1942 kmem_cache_destroy(rnode4_cache);
1944 for (i = 0; i < rtable4size; i++)
1945 rw_destroy(&rtable4[i].r_lock);
1947 kmem_free(rtable4, rtable4size * sizeof (*rtable4));
1949 return (0);
1953 * Return non-zero if the given filehandle refers to the root filehandle
1954 * for the given rnode.
1957 static int
1958 isrootfh(nfs4_sharedfh_t *fh, rnode4_t *rp)
1960 int isroot;
1962 isroot = 0;
1963 if (SFH4_SAME(VTOMI4(RTOV4(rp))->mi_rootfh, fh))
1964 isroot = 1;
1966 return (isroot);
1970 * The r4_stub_* routines assume that the rnode is newly activated, and
1971 * that the caller either holds the hash bucket r_lock for this rnode as
1972 * RW_WRITER, or holds r_statelock.
1974 static void
1975 r4_stub_set(rnode4_t *rp, nfs4_stub_type_t type)
1977 vnode_t *vp = RTOV4(rp);
1978 krwlock_t *hash_lock = &rp->r_hashq->r_lock;
1980 ASSERT(RW_WRITE_HELD(hash_lock) || MUTEX_HELD(&rp->r_statelock));
1982 rp->r_stub_type = type;
1985 * Safely switch this vnode to the trigger vnodeops.
1987 * Currently, we don't ever switch a trigger vnode back to using
1988 * "regular" v4 vnodeops. NFS4_STUB_NONE is only used to note that
1989 * a new v4 object is not a trigger, and it will already have the
1990 * correct v4 vnodeops by default. So, no "else" case required here.
1992 if (type != NFS4_STUB_NONE)
1993 vn_setops(vp, &nfs4_trigger_vnodeops);
1996 void
1997 r4_stub_mirrormount(rnode4_t *rp)
1999 r4_stub_set(rp, NFS4_STUB_MIRRORMOUNT);
2002 void
2003 r4_stub_referral(rnode4_t *rp)
2005 DTRACE_PROBE1(nfs4clnt__func__referral__moved,
2006 vnode_t *, RTOV4(rp));
2007 r4_stub_set(rp, NFS4_STUB_REFERRAL);
2010 void
2011 r4_stub_none(rnode4_t *rp)
2013 r4_stub_set(rp, NFS4_STUB_NONE);
#ifdef DEBUG

/*
 * Look in the rnode table for other rnodes that have the same filehandle.
 * Assume the lock is held for the hash chain of checkrp.
 *
 * Does nothing unless r4_check_for_dups is set.  Panics if another rnode
 * on the same vfs has a filehandle equal to checkrp's.
 */
static void
r4_dup_check(rnode4_t *checkrp, vfs_t *vfsp)
{
	rnode4_t *rp;
	vnode_t *tvp;
	nfs4_fhandle_t fh, fh2;
	int index;
	int need_lock;

	if (!r4_check_for_dups)
		return;

	ASSERT(RW_LOCK_HELD(&checkrp->r_hashq->r_lock));

	sfh4_copyval(checkrp->r_fh, &fh);

	for (index = 0; index < rtable4size; index++) {
		/*
		 * checkrp's own bucket is already locked by the caller;
		 * every other bucket is locked here for the walk.
		 */
		need_lock = (&rtable4[index] != checkrp->r_hashq);

		if (need_lock)
			rw_enter(&rtable4[index].r_lock, RW_READER);

		rp = rtable4[index].r_hashf;
		while (rp != (rnode4_t *)(&rtable4[index])) {
			if (rp != checkrp) {
				tvp = RTOV4(rp);
				if (tvp->v_vfsp == vfsp) {
					sfh4_copyval(rp->r_fh, &fh2);
					if (nfs4cmpfhandle(&fh, &fh2) == 0) {
						cmn_err(CE_PANIC,
						    "rnodes with same fs, fh "
						    "(%p, %p)",
						    (void *)checkrp,
						    (void *)rp);
					}
				}
			}
			rp = rp->r_hashf;
		}

		if (need_lock)
			rw_exit(&rtable4[index].r_lock);
	}
}

#endif /* DEBUG */