kernel/fs/nfs/nfs4_rnode.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 /*
  27  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  28  *      All Rights Reserved
  29  */
  30
  31 /*
  32  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  33  * Copyright (c) 2017 by Delphix. All rights reserved.
  34  */
  35
  36 #include <sys/param.h>
  37 #include <sys/types.h>
  38 #include <sys/systm.h>
  39 #include <sys/cred.h>
  40 #include <sys/proc.h>
  41 #include <sys/user.h>
  42 #include <sys/time.h>
  43 #include <sys/buf.h>
  44 #include <sys/vfs.h>
  45 #include <sys/vnode.h>
  46 #include <sys/socket.h>
  47 #include <sys/uio.h>
  48 #include <sys/tiuser.h>
  49 #include <sys/swap.h>
  50 #include <sys/errno.h>
  51 #include <sys/debug.h>
  52 #include <sys/kmem.h>
  53 #include <sys/kstat.h>
  54 #include <sys/cmn_err.h>
  55 #include <sys/vtrace.h>
  56 #include <sys/session.h>
  57 #include <sys/dnlc.h>
  58 #include <sys/bitmap.h>
  59 #include <sys/acl.h>
  60 #include <sys/ddi.h>
  61 #include <sys/pathname.h>
  62 #include <sys/flock.h>
  63 #include <sys/dirent.h>
  64 #include <sys/flock.h>
  65 #include <sys/callb.h>
  66 #include <sys/sdt.h>
  67
  68 #include <vm/pvn.h>
  69
  70 #include <rpc/types.h>
  71 #include <rpc/xdr.h>
  72 #include <rpc/auth.h>
  73 #include <rpc/rpcsec_gss.h>
  74 #include <rpc/clnt.h>
  75
  76 #include <nfs/nfs.h>
  77 #include <nfs/nfs_clnt.h>
  78 #include <nfs/nfs_acl.h>
  79
  80 #include <nfs/nfs4.h>
  81 #include <nfs/rnode4.h>
  82 #include <nfs/nfs4_clnt.h>
  83
  84 /*
  85  * The hash queues for the access to active and cached rnodes
  86  * are organized as doubly linked lists.  A reader/writer lock
  87  * for each hash bucket is used to control access and to synchronize
  88  * lookups, additions, and deletions from the hash queue.
  89  *
  90  * The rnode freelist is organized as a doubly linked list with
  91  * a head pointer.  Additions and deletions are synchronized via
  92  * a single mutex.
  93  *
  94  * In order to add an rnode to the free list, it must be hashed into
  95  * a hash queue and the exclusive lock to the hash queue be held.
  96  * If an rnode is not hashed into a hash queue, then it is destroyed
  97  * because it represents no valuable information that can be reused
  98  * about the file.  The exclusive lock to the hash queue must be
  99  * held in order to prevent a lookup in the hash queue from finding
 100  * the rnode and using it and assuming that the rnode is not on the
 101  * freelist.  The lookup in the hash queue will have the hash queue
 102  * locked, either exclusive or shared.
 103  *
 104  * The vnode reference count for each rnode is not allowed to drop
 105  * below 1.  This prevents external entities, such as the VM
 106  * subsystem, from acquiring references to vnodes already on the
 107  * freelist and then trying to place them back on the freelist
  108  * when their reference is released.  This means that when an
 109  * rnode is looked up in the hash queues, then either the rnode
 110  * is removed from the freelist and that reference is transferred to
 111  * the new reference or the vnode reference count must be incremented
 112  * accordingly.  The mutex for the freelist must be held in order to
 113  * accurately test to see if the rnode is on the freelist or not.
 114  * The hash queue lock might be held shared and it is possible that
 115  * two different threads may race to remove the rnode from the
 116  * freelist.  This race can be resolved by holding the mutex for the
 117  * freelist.  Please note that the mutex for the freelist does not
 118  * need to be held if the rnode is not on the freelist.  It can not be
 119  * placed on the freelist due to the requirement that the thread
 120  * putting the rnode on the freelist must hold the exclusive lock
 121  * to the hash queue and the thread doing the lookup in the hash
 122  * queue is holding either a shared or exclusive lock to the hash
 123  * queue.
 124  *
 125  * The lock ordering is:
 126  *
 127  *      hash bucket lock -> vnode lock
 128  *      hash bucket lock -> freelist lock -> r_statelock
 129  */
/*
 * Table of rnode hash buckets.  A bucket is selected with rtable4hash()
 * and each bucket carries its own r_lock.
 */
r4hashq_t *rtable4;

/*
 * Held while the freelist head is examined and while an rnode is unlinked
 * from the freelist (see make_rnode4()).
 */
static kmutex_t rp4freelist_lock;
/* Head of the rnode freelist; NULL means there is nothing to recycle. */
static rnode4_t *rp4freelist = NULL;
/*
 * Bumped atomically each time make_rnode4() allocates a fresh rnode from
 * rnode4_cache; compared against nrnode to choose between recycling a
 * free rnode and allocating another one.
 */
static long rnode4_new = 0;
/* NOTE(review): presumably the number of buckets in rtable4 -- confirm. */
int rtable4size;
/* Mask applied by rtable4hash() to turn a hash value into a bucket index. */
static int rtable4mask;
/* kmem cache that rnode4_t structures are allocated from. */
static struct kmem_cache *rnode4_cache;
/* NOTE(review): not referenced in this part of the file -- confirm use. */
static int rnode4_hashlen = 4;

/* Forward declarations. */
static void     r4inactive(rnode4_t *, cred_t *);
static vnode_t  *make_rnode4(nfs4_sharedfh_t *, r4hashq_t *, struct vfs *,
                    const struct vnodeops *,
                    int (*)(vnode_t *, page_t *, uoff_t *, size_t *, int,
                    cred_t *),
                    int *, cred_t *);
static void     rp4_rmfree(rnode4_t *);
int             nfs4_free_data_reclaim(rnode4_t *);
static int      nfs4_active_data_reclaim(rnode4_t *);
static int      nfs4_free_reclaim(void);
static int      nfs4_active_reclaim(void);
static int      nfs4_rnode_reclaim(void);
static void     nfs4_reclaim(void *);
static int      isrootfh(nfs4_sharedfh_t *, rnode4_t *);
static void     uninit_rnode4(rnode4_t *);
static void     destroy_rnode4(rnode4_t *);
static void     r4_stub_set(rnode4_t *, nfs4_stub_type_t);

/* Tunables and helpers that only exist in DEBUG kernels. */
#ifdef DEBUG
static int r4_check_for_dups = 0; /* Flag to enable dup rnode detection. */
/* NOTE(review): presumably gates extra rnode diagnostics -- confirm. */
static int nfs4_rnode_debug = 0;
/* if nonzero, kmem_cache_free() rnodes rather than place on freelist */
static int nfs4_rnode_nofreelist = 0;
/* give messages on colliding shared filehandles */
static void     r4_dup_check(rnode4_t *, vfs_t *);
#endif
 166
 167 /*
 168  * If the vnode has pages, run the list and check for any that are
 169  * still dangling.  We call this routine before putting an rnode on
 170  * the free list.
 171  */
 172 static int
 173 nfs4_dross_pages(vnode_t *vp)
 174 {
 175         page_t *pp;
 176
 177         vmobject_lock(&vp->v_object);
 178         for (pp = vmobject_get_head(&vp->v_object);
 179              pp != NULL;
 180              pp = vmobject_get_next(&vp->v_object, pp)) {
 181                 if (PP_ISPVN_TAG(pp) &&
 182                     pp->p_fsdata != C_NOCOMMIT) {
 183                         vmobject_unlock(&vp->v_object);
 184                         return (1);
 185                 }
 186         }
 187         vmobject_unlock(&vp->v_object);
 188
 189         return (0);
 190 }
 191
 192 /*
 193  * Flush any pages left on this rnode.
 194  */
 195 static void
 196 r4flushpages(rnode4_t *rp, cred_t *cr)
 197 {
 198         vnode_t *vp;
 199         int error;
 200
 201         /*
 202          * Before freeing anything, wait until all asynchronous
 203          * activity is done on this rnode.  This will allow all
 204          * asynchronous read ahead and write behind i/o's to
 205          * finish.
 206          */
 207         mutex_enter(&rp->r_statelock);
 208         while (rp->r_count > 0)
 209                 cv_wait(&rp->r_cv, &rp->r_statelock);
 210         mutex_exit(&rp->r_statelock);
 211
 212         /*
 213          * Flush and invalidate all pages associated with the vnode.
 214          */
 215         vp = RTOV4(rp);
 216         if (nfs4_has_pages(vp)) {
 217                 ASSERT(vp->v_type != VCHR);
 218                 if ((rp->r_flags & R4DIRTY) && !rp->r_error) {
 219                         error = fop_putpage(vp, 0, 0, 0, cr, NULL);
 220                         if (error && (error == ENOSPC || error == EDQUOT)) {
 221                                 mutex_enter(&rp->r_statelock);
 222                                 if (!rp->r_error)
 223                                         rp->r_error = error;
 224                                 mutex_exit(&rp->r_statelock);
 225                         }
 226                 }
 227                 nfs4_invalidate_pages(vp, 0, cr);
 228         }
 229 }
 230
 231 /*
 232  * Free the resources associated with an rnode.
 233  */
 234 static void
 235 r4inactive(rnode4_t *rp, cred_t *cr)
 236 {
 237         vnode_t *vp;
 238         char *contents;
 239         int size;
 240         vsecattr_t *vsp;
 241         vnode_t *xattr;
 242
 243         r4flushpages(rp, cr);
 244
 245         vp = RTOV4(rp);
 246
 247         /*
 248          * Free any held caches which may be
 249          * associated with this rnode.
 250          */
 251         mutex_enter(&rp->r_statelock);
 252         contents = rp->r_symlink.contents;
 253         size = rp->r_symlink.size;
 254         rp->r_symlink.contents = NULL;
 255         vsp = rp->r_secattr;
 256         rp->r_secattr = NULL;
 257         xattr = rp->r_xattr_dir;
 258         rp->r_xattr_dir = NULL;
 259         mutex_exit(&rp->r_statelock);
 260
 261         /*
 262          * Free the access cache entries.
 263          */
 264         (void) nfs4_access_purge_rp(rp);
 265
 266         /*
 267          * Free the readdir cache entries.
 268          */
 269         nfs4_purge_rddir_cache(vp);
 270
 271         /*
 272          * Free the symbolic link cache.
 273          */
 274         if (contents != NULL) {
 275
 276                 kmem_free((void *)contents, size);
 277         }
 278
 279         /*
 280          * Free any cached ACL.
 281          */
 282         if (vsp != NULL)
 283                 nfs4_acl_free_cache(vsp);
 284
 285         /*
 286          * Release the cached xattr_dir
 287          */
 288         if (xattr != NULL)
 289                 VN_RELE(xattr);
 290 }
 291
 292 /*
 293  * We have seen a case that the fh passed in is for "." which
 294  * should be a VROOT node, however, the fh is different from the
 295  * root fh stored in the mntinfo4_t. The invalid fh might be
 296  * from a misbehaved server and will panic the client system at
 297  * a later time. To avoid the panic, we drop the bad fh, use
 298  * the root fh from mntinfo4_t, and print an error message
 299  * for attention.
 300  */
 301 nfs4_sharedfh_t *
 302 badrootfh_check(nfs4_sharedfh_t *fh, nfs4_fname_t *nm, mntinfo4_t *mi,
 303     int *wasbad)
 304 {
 305         char *s;
 306
 307         *wasbad = 0;
 308         s = fn_name(nm);
 309         ASSERT(strcmp(s, "..") != 0);
 310
 311         if ((s[0] == '.' && s[1] == '\0') && fh &&
 312             !SFH4_SAME(mi->mi_rootfh, fh)) {
 313 #ifdef DEBUG
 314                 nfs4_fhandle_t fhandle;
 315
 316                 zcmn_err(mi->mi_zone->zone_id, CE_WARN,
 317                     "Server %s returns a different "
 318                     "root filehandle for the path %s:",
 319                     mi->mi_curr_serv->sv_hostname,
 320                     mi->mi_curr_serv->sv_path);
 321
 322                 /* print the bad fh */
 323                 fhandle.fh_len = fh->sfh_fh.nfs_fh4_len;
 324                 bcopy(fh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
 325                     fhandle.fh_len);
 326                 nfs4_printfhandle(&fhandle);
 327
 328                 /* print mi_rootfh */
 329                 fhandle.fh_len = mi->mi_rootfh->sfh_fh.nfs_fh4_len;
 330                 bcopy(mi->mi_rootfh->sfh_fh.nfs_fh4_val, fhandle.fh_buf,
 331                     fhandle.fh_len);
 332                 nfs4_printfhandle(&fhandle);
 333 #endif
 334                 /* use mi_rootfh instead; fh will be rele by the caller */
 335                 fh = mi->mi_rootfh;
 336                 *wasbad = 1;
 337         }
 338
 339         kmem_free(s, MAXNAMELEN);
 340         return (fh);
 341 }
 342
 343 void
 344 r4_do_attrcache(vnode_t *vp, nfs4_ga_res_t *garp, int newnode,
 345     hrtime_t t, cred_t *cr, int index)
 346 {
 347         int is_stub;
 348         vattr_t *attr;
 349         /*
 350          * Don't add to attrcache if time overflow, but
 351          * no need to check because either attr is null or the time
 352          * values in it were processed by nfs4_time_ntov(), which checks
 353          * for time overflows.
 354          */
 355         attr = garp ? &garp->n4g_va : NULL;
 356
 357         if (attr) {
 358                 if (!newnode) {
 359                         rw_exit(&rtable4[index].r_lock);
 360 #ifdef DEBUG
 361                         if (vp->v_type != attr->va_type &&
 362                             vp->v_type != VNON && attr->va_type != VNON) {
 363                                 zcmn_err(VTOMI4(vp)->mi_zone->zone_id, CE_WARN,
 364                                     "makenfs4node: type (%d) doesn't "
 365                                     "match type of found node at %p (%d)",
 366                                     attr->va_type, (void *)vp, vp->v_type);
 367                         }
 368 #endif
 369                         nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
 370                 } else {
 371                         rnode4_t *rp = VTOR4(vp);
 372
 373                         vp->v_type = attr->va_type;
 374                         vp->v_rdev = attr->va_rdev;
 375
 376                         /*
 377                          * Turn this object into a "stub" object if we
 378                          * crossed an underlying server fs boundary.
 379                          * To make this check, during mount we save the
 380                          * fsid of the server object being mounted.
 381                          * Here we compare this object's server fsid
 382                          * with the fsid we saved at mount.  If they
 383                          * are different, we crossed server fs boundary.
 384                          *
 385                          * The stub type is set (or not) at rnode
 386                          * creation time and it never changes for life
 387                          * of the rnode.
 388                          *
 389                          * This stub will be for a mirror-mount, rather than
 390                          * a referral (the latter also sets R4SRVSTUB).
 391                          *
 392                          * The stub type is also set during RO failover,
 393                          * nfs4_remap_file().
 394                          *
 395                          * We don't bother with taking r_state_lock to
 396                          * set the stub type because this is a new rnode
 397                          * and we're holding the hash bucket r_lock RW_WRITER.
 398                          * No other thread could have obtained access
 399                          * to this rnode.
 400                          */
 401                         is_stub = 0;
 402                         if (garp->n4g_fsid_valid) {
 403                                 fattr4_fsid ga_fsid = garp->n4g_fsid;
 404                                 servinfo4_t *svp = rp->r_server;
 405
 406                                 rp->r_srv_fsid = ga_fsid;
 407
 408                                 (void) nfs_rw_enter_sig(&svp->sv_lock,
 409                                     RW_READER, 0);
 410                                 if (!FATTR4_FSID_EQ(&ga_fsid, &svp->sv_fsid))
 411                                         is_stub = 1;
 412                                 nfs_rw_exit(&svp->sv_lock);
 413                         }
 414
 415                         if (is_stub)
 416                                 r4_stub_mirrormount(rp);
 417                         else
 418                                 r4_stub_none(rp);
 419
 420                         /* Can not cache partial attr */
 421                         if (attr->va_mask == AT_ALL)
 422                                 nfs4_attrcache_noinval(vp, garp, t);
 423                         else
 424                                 PURGE_ATTRCACHE4(vp);
 425
 426                         rw_exit(&rtable4[index].r_lock);
 427                 }
 428         } else {
 429                 if (newnode) {
 430                         PURGE_ATTRCACHE4(vp);
 431                 }
 432                 rw_exit(&rtable4[index].r_lock);
 433         }
 434 }
 435
 436 /*
 437  * Find or create an rnode based primarily on filehandle.  To be
 438  * used when dvp (vnode for parent directory) is not available;
 439  * otherwise, makenfs4node() should be used.
 440  *
 441  * The nfs4_fname_t argument *npp is consumed and nulled out.
 442  */
 443
 444 vnode_t *
 445 makenfs4node_by_fh(nfs4_sharedfh_t *sfh, nfs4_sharedfh_t *psfh,
 446     nfs4_fname_t **npp, nfs4_ga_res_t *garp,
 447     mntinfo4_t *mi, cred_t *cr, hrtime_t t)
 448 {
 449         vfs_t *vfsp = mi->mi_vfsp;
 450         int newnode = 0;
 451         vnode_t *vp;
 452         rnode4_t *rp;
 453         svnode_t *svp;
 454         nfs4_fname_t *name, *svpname;
 455         int index;
 456
 457         ASSERT(npp && *npp);
 458         name = *npp;
 459         *npp = NULL;
 460
 461         index = rtable4hash(sfh);
 462         rw_enter(&rtable4[index].r_lock, RW_READER);
 463
 464         vp = make_rnode4(sfh, &rtable4[index], vfsp,
 465             &nfs4_vnodeops, nfs4_putapage, &newnode, cr);
 466
 467         svp = VTOSV(vp);
 468         rp = VTOR4(vp);
 469         if (newnode) {
 470                 svp->sv_forw = svp->sv_back = svp;
 471                 svp->sv_name = name;
 472                 if (psfh != NULL)
 473                         sfh4_hold(psfh);
 474                 svp->sv_dfh = psfh;
 475         } else {
 476                 /*
 477                  * It is possible that due to a server
 478                  * side rename fnames have changed.
 479                  * update the fname here.
 480                  */
 481                 mutex_enter(&rp->r_svlock);
 482                 svpname = svp->sv_name;
 483                 if (svp->sv_name != name) {
 484                         svp->sv_name = name;
 485                         mutex_exit(&rp->r_svlock);
 486                         fn_rele(&svpname);
 487                 } else {
 488                         mutex_exit(&rp->r_svlock);
 489                         fn_rele(&name);
 490                 }
 491         }
 492
 493         ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
 494         r4_do_attrcache(vp, garp, newnode, t, cr, index);
 495         ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);
 496
 497         return (vp);
 498 }
 499
 500 /*
 501  * Find or create a vnode for the given filehandle, filesystem, parent, and
 502  * name.  The reference to nm is consumed, so the caller must first do an
 503  * fn_hold() if it wants to continue using nm after this call.
 504  */
 505 vnode_t *
 506 makenfs4node(nfs4_sharedfh_t *fh, nfs4_ga_res_t *garp, struct vfs *vfsp,
 507     hrtime_t t, cred_t *cr, vnode_t *dvp, nfs4_fname_t *nm)
 508 {
 509         vnode_t *vp;
 510         int newnode;
 511         int index;
 512         mntinfo4_t *mi = VFTOMI4(vfsp);
 513         int had_badfh = 0;
 514         rnode4_t *rp;
 515
 516         ASSERT(dvp != NULL);
 517
 518         fh = badrootfh_check(fh, nm, mi, &had_badfh);
 519
 520         index = rtable4hash(fh);
 521         rw_enter(&rtable4[index].r_lock, RW_READER);
 522
 523         /*
 524          * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
 525          */
 526         vp = make_rnode4(fh, &rtable4[index], vfsp, &nfs4_vnodeops,
 527             nfs4_putapage, &newnode, cr);
 528
 529         rp = VTOR4(vp);
 530         sv_activate(&vp, dvp, &nm, newnode);
 531         if (dvp->v_flag & V_XATTRDIR) {
 532                 mutex_enter(&rp->r_statelock);
 533                 rp->r_flags |= R4ISXATTR;
 534                 mutex_exit(&rp->r_statelock);
 535         }
 536
 537         /* if getting a bad file handle, do not cache the attributes. */
 538         if (had_badfh) {
 539                 rw_exit(&rtable4[index].r_lock);
 540                 return (vp);
 541         }
 542
 543         ASSERT(RW_LOCK_HELD(&rtable4[index].r_lock));
 544         r4_do_attrcache(vp, garp, newnode, t, cr, index);
 545         ASSERT(rw_owner(&rtable4[index].r_lock) != curthread);
 546
 547         return (vp);
 548 }
 549
 550 /*
 551  * Hash on address of filehandle object.
 552  * XXX totally untuned.
 553  */
 554
 555 int
 556 rtable4hash(nfs4_sharedfh_t *fh)
 557 {
 558         return (((uintptr_t)fh / sizeof (*fh)) & rtable4mask);
 559 }
 560
 561 /*
 562  * Find or create the vnode for the given filehandle and filesystem.
 563  * *newnode is set to zero if the vnode already existed; non-zero if it had
 564  * to be created.
 565  *
 566  * Note: make_rnode4() may upgrade the hash bucket lock to exclusive.
 567  */
 568
 569 static vnode_t *
 570 make_rnode4(nfs4_sharedfh_t *fh, r4hashq_t *rhtp, struct vfs *vfsp,
 571     const struct vnodeops *vops,
 572     int (*putapage)(vnode_t *, page_t *, uoff_t *, size_t *, int, cred_t *),
 573     int *newnode, cred_t *cr)
 574 {
 575         rnode4_t *rp;
 576         rnode4_t *trp;
 577         vnode_t *vp;
 578         mntinfo4_t *mi;
 579
 580         ASSERT(RW_READ_HELD(&rhtp->r_lock));
 581
 582         mi = VFTOMI4(vfsp);
 583
 584 start:
 585         if ((rp = r4find(rhtp, fh, vfsp)) != NULL) {
 586                 vp = RTOV4(rp);
 587                 *newnode = 0;
 588                 return (vp);
 589         }
 590         rw_exit(&rhtp->r_lock);
 591
 592         mutex_enter(&rp4freelist_lock);
 593
 594         if (rp4freelist != NULL && rnode4_new >= nrnode) {
 595                 rp = rp4freelist;
 596                 rp4_rmfree(rp);
 597                 mutex_exit(&rp4freelist_lock);
 598
 599                 vp = RTOV4(rp);
 600
 601                 if (rp->r_flags & R4HASHED) {
 602                         rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
 603                         mutex_enter(&vp->v_lock);
 604                         if (vp->v_count > 1) {
 605                                 VN_RELE_LOCKED(vp);
 606                                 mutex_exit(&vp->v_lock);
 607                                 rw_exit(&rp->r_hashq->r_lock);
 608                                 rw_enter(&rhtp->r_lock, RW_READER);
 609                                 goto start;
 610                         }
 611                         mutex_exit(&vp->v_lock);
 612                         rp4_rmhash_locked(rp);
 613                         rw_exit(&rp->r_hashq->r_lock);
 614                 }
 615
 616                 r4inactive(rp, cr);
 617
 618                 mutex_enter(&vp->v_lock);
 619                 if (vp->v_count > 1) {
 620                         VN_RELE_LOCKED(vp);
 621                         mutex_exit(&vp->v_lock);
 622                         rw_enter(&rhtp->r_lock, RW_READER);
 623                         goto start;
 624                 }
 625                 mutex_exit(&vp->v_lock);
 626                 vn_invalid(vp);
 627
 628                 /*
 629                  * destroy old locks before bzero'ing and
 630                  * recreating the locks below.
 631                  */
 632                 uninit_rnode4(rp);
 633
 634                 /*
 635                  * Make sure that if rnode is recycled then
 636                  * VFS count is decremented properly before
 637                  * reuse.
 638                  */
 639                 VFS_RELE(vp->v_vfsp);
 640                 vn_reinit(vp);
 641         } else {
 642                 vnode_t *new_vp;
 643
 644                 mutex_exit(&rp4freelist_lock);
 645
 646                 rp = kmem_cache_alloc(rnode4_cache, KM_SLEEP);
 647                 new_vp = vn_alloc(KM_SLEEP);
 648
 649                 atomic_inc_ulong((ulong_t *)&rnode4_new);
 650 #ifdef DEBUG
 651                 clstat4_debug.nrnode.value.ui64++;
 652 #endif
 653                 vp = new_vp;
 654         }
 655
 656         bzero(rp, sizeof (*rp));
 657         rp->r_vnode = vp;
 658         nfs_rw_init(&rp->r_rwlock, NULL, RW_DEFAULT, NULL);
 659         nfs_rw_init(&rp->r_lkserlock, NULL, RW_DEFAULT, NULL);
 660         mutex_init(&rp->r_svlock, NULL, MUTEX_DEFAULT, NULL);
 661         mutex_init(&rp->r_statelock, NULL, MUTEX_DEFAULT, NULL);
 662         mutex_init(&rp->r_statev4_lock, NULL, MUTEX_DEFAULT, NULL);
 663         mutex_init(&rp->r_os_lock, NULL, MUTEX_DEFAULT, NULL);
 664         rp->created_v4 = 0;
 665         list_create(&rp->r_open_streams, sizeof (nfs4_open_stream_t),
 666             offsetof(nfs4_open_stream_t, os_node));
 667         rp->r_lo_head.lo_prev_rnode = &rp->r_lo_head;
 668         rp->r_lo_head.lo_next_rnode = &rp->r_lo_head;
 669         cv_init(&rp->r_cv, NULL, CV_DEFAULT, NULL);
 670         cv_init(&rp->r_commit.c_cv, NULL, CV_DEFAULT, NULL);
 671         rp->r_flags = R4READDIRWATTR;
 672         rp->r_fh = fh;
 673         rp->r_hashq = rhtp;
 674         sfh4_hold(rp->r_fh);
 675         rp->r_server = mi->mi_curr_serv;
 676         rp->r_deleg_type = OPEN_DELEGATE_NONE;
 677         rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
 678         nfs_rw_init(&rp->r_deleg_recall_lock, NULL, RW_DEFAULT, NULL);
 679
 680         rddir4_cache_create(rp);
 681         rp->r_putapage = putapage;
 682         vn_setops(vp, vops);
 683         vp->v_data = (caddr_t)rp;
 684         vp->v_vfsp = vfsp;
 685         VFS_HOLD(vfsp);
 686         vp->v_type = VNON;
 687         vp->v_flag |= VMODSORT;
 688         if (isrootfh(fh, rp))
 689                 vp->v_flag = VROOT;
 690         vn_exists(vp);
 691
 692         /*
 693          * There is a race condition if someone else
 694          * alloc's the rnode while no locks are held, so we
 695          * check again and recover if found.
 696          */
 697         rw_enter(&rhtp->r_lock, RW_WRITER);
 698         if ((trp = r4find(rhtp, fh, vfsp)) != NULL) {
 699                 vp = RTOV4(trp);
 700                 *newnode = 0;
 701                 rw_exit(&rhtp->r_lock);
 702                 rp4_addfree(rp, cr);
 703                 rw_enter(&rhtp->r_lock, RW_READER);
 704                 return (vp);
 705         }
 706         rp4_addhash(rp);
 707         *newnode = 1;
 708         return (vp);
 709 }
 710
 711 static void
 712 uninit_rnode4(rnode4_t *rp)
 713 {
 714         vnode_t *vp = RTOV4(rp);
 715
 716         ASSERT(rp != NULL);
 717         ASSERT(vp != NULL);
 718         ASSERT(vp->v_count == 1);
 719         ASSERT(rp->r_count == 0);
 720         ASSERT(rp->r_mapcnt == 0);
 721         if (rp->r_flags & R4LODANGLERS) {
 722                 nfs4_flush_lock_owners(rp);
 723         }
 724         ASSERT(rp->r_lo_head.lo_next_rnode == &rp->r_lo_head);
 725         ASSERT(rp->r_lo_head.lo_prev_rnode == &rp->r_lo_head);
 726         ASSERT(!(rp->r_flags & R4HASHED));
 727         ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
 728         nfs4_clear_open_streams(rp);
 729         list_destroy(&rp->r_open_streams);
 730
 731         /*
 732          * Destroy the rddir cache first since we need to grab the r_statelock.
 733          */
 734         mutex_enter(&rp->r_statelock);
 735         rddir4_cache_destroy(rp);
 736         mutex_exit(&rp->r_statelock);
 737         sv_uninit(&rp->r_svnode);
 738         sfh4_rele(&rp->r_fh);
 739         nfs_rw_destroy(&rp->r_rwlock);
 740         nfs_rw_destroy(&rp->r_lkserlock);
 741         mutex_destroy(&rp->r_statelock);
 742         mutex_destroy(&rp->r_statev4_lock);
 743         mutex_destroy(&rp->r_os_lock);
 744         cv_destroy(&rp->r_cv);
 745         cv_destroy(&rp->r_commit.c_cv);
 746         nfs_rw_destroy(&rp->r_deleg_recall_lock);
 747         if (rp->r_flags & R4DELMAPLIST)
 748                 list_destroy(&rp->r_indelmap);
 749 }
 750
 751 /*
 752  * Put an rnode on the free list.
 753  *
 754  * Rnodes which were allocated above and beyond the normal limit
 755  * are immediately freed.
 756  */
 757 void
 758 rp4_addfree(rnode4_t *rp, cred_t *cr)
 759 {
 760         vnode_t *vp;
 761         vnode_t *xattr;
 762         struct vfs *vfsp;
 763
 764         vp = RTOV4(rp);
 765         ASSERT(vp->v_count >= 1);
 766         ASSERT(rp->r_freef == NULL && rp->r_freeb == NULL);
 767
 768         /*
 769          * If we have too many rnodes allocated and there are no
 770          * references to this rnode, or if the rnode is no longer
  771          * accessible because it does not reside in the hash queues,
 772          * or if an i/o error occurred while writing to the file,
 773          * then just free it instead of putting it on the rnode
 774          * freelist.
 775          */
 776         vfsp = vp->v_vfsp;
 777         if (((rnode4_new > nrnode || !(rp->r_flags & R4HASHED) ||
 778 #ifdef DEBUG
 779             (nfs4_rnode_nofreelist != 0) ||
 780 #endif
 781             rp->r_error || (rp->r_flags & R4RECOVERR) ||
 782             (vfsp->vfs_flag & VFS_UNMOUNTED)) && rp->r_count == 0)) {
 783                 if (rp->r_flags & R4HASHED) {
 784                         rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
 785                         mutex_enter(&vp->v_lock);
 786                         if (vp->v_count > 1) {
 787                                 VN_RELE_LOCKED(vp);
 788                                 mutex_exit(&vp->v_lock);
 789                                 rw_exit(&rp->r_hashq->r_lock);
 790                                 return;
 791                         }
 792                         mutex_exit(&vp->v_lock);
 793                         rp4_rmhash_locked(rp);
 794                         rw_exit(&rp->r_hashq->r_lock);
 795                 }
 796
 797                 /*
 798                  * Make sure we don't have a delegation on this rnode
 799                  * before destroying it.
 800                  */
 801                 if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
 802                         (void) nfs4delegreturn(rp,
 803                             NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
 804                 }
 805
 806                 r4inactive(rp, cr);
 807
 808                 /*
 809                  * Recheck the vnode reference count.  We need to
 810                  * make sure that another reference has not been
 811                  * acquired while we were not holding v_lock.  The
 812                  * rnode is not in the rnode hash queues; one
 813                  * way for a reference to have been acquired
 814                  * is for a fop_putpage because the rnode was marked
 815                  * with R4DIRTY or for a modified page.  This
 816                  * reference may have been acquired before our call
 817                  * to r4inactive.  The i/o may have been completed,
 818                  * thus allowing r4inactive to complete, but the
 819                  * reference to the vnode may not have been released
 820                  * yet.  In any case, the rnode can not be destroyed
 821                  * until the other references to this vnode have been
 822                  * released.  The other references will take care of
 823                  * either destroying the rnode or placing it on the
 824                  * rnode freelist.  If there are no other references,
 825                  * then the rnode may be safely destroyed.
 826                  */
 827                 mutex_enter(&vp->v_lock);
 828                 if (vp->v_count > 1) {
 829                         VN_RELE_LOCKED(vp);
 830                         mutex_exit(&vp->v_lock);
 831                         return;
 832                 }
 833                 mutex_exit(&vp->v_lock);
 834
 835                 destroy_rnode4(rp);
 836                 return;
 837         }
 838
 839         /*
 840          * Lock the hash queue and then recheck the reference count
 841          * to ensure that no other threads have acquired a reference
 842          * to indicate that the rnode should not be placed on the
 843          * freelist.  If another reference has been acquired, then
 844          * just release this one and let the other thread complete
 845          * the processing of adding this rnode to the freelist.
 846          */
 847 again:
 848         rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
 849
 850         mutex_enter(&vp->v_lock);
 851         if (vp->v_count > 1) {
 852                 VN_RELE_LOCKED(vp);
 853                 mutex_exit(&vp->v_lock);
 854                 rw_exit(&rp->r_hashq->r_lock);
 855                 return;
 856         }
 857         mutex_exit(&vp->v_lock);
 858
 859         /*
 860          * Make sure we don't put an rnode with a delegation
 861          * on the free list.
 862          */
 863         if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
 864                 rw_exit(&rp->r_hashq->r_lock);
 865                 (void) nfs4delegreturn(rp,
 866                     NFS4_DR_FORCE|NFS4_DR_PUSH|NFS4_DR_REOPEN);
 867                 goto again;
 868         }
 869
 870         /*
 871          * Now that we have the hash queue lock, and we know there
 872          * are not anymore references on the vnode, check to make
 873          * sure there aren't any open streams still on the rnode.
 874          * If so, drop the hash queue lock, remove the open streams,
 875          * and recheck the v_count.
 876          */
 877         mutex_enter(&rp->r_os_lock);
 878         if (list_head(&rp->r_open_streams) != NULL) {
 879                 mutex_exit(&rp->r_os_lock);
 880                 rw_exit(&rp->r_hashq->r_lock);
 881                 if (nfs_zone() != VTOMI4(vp)->mi_zone)
 882                         nfs4_clear_open_streams(rp);
 883                 else
 884                         (void) nfs4close_all(vp, cr);
 885                 goto again;
 886         }
 887         mutex_exit(&rp->r_os_lock);
 888
 889         /*
 890          * Before we put it on the freelist, make sure there are no pages.
 891          * If there are, flush and commit of all of the dirty and
 892          * uncommitted pages, assuming the file system isn't read only.
 893          */
 894         if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY) && nfs4_dross_pages(vp)) {
 895                 rw_exit(&rp->r_hashq->r_lock);
 896                 r4flushpages(rp, cr);
 897                 goto again;
 898         }
 899
 900         /*
 901          * Before we put it on the freelist, make sure there is no
 902          * active xattr directory cached, the freelist will not
 903          * have its entries r4inactive'd if there is still an active
 904          * rnode, thus nothing in the freelist can hold another
 905          * rnode active.
 906          */
 907         xattr = rp->r_xattr_dir;
 908         rp->r_xattr_dir = NULL;
 909
 910         /*
 911          * If there is no cached data or metadata for this file, then
 912          * put the rnode on the front of the freelist so that it will
 913          * be reused before other rnodes which may have cached data or
 914          * metadata associated with them.
 915          */
 916         mutex_enter(&rp4freelist_lock);
 917         if (rp4freelist == NULL) {
 918                 rp->r_freef = rp;
 919                 rp->r_freeb = rp;
 920                 rp4freelist = rp;
 921         } else {
 922                 rp->r_freef = rp4freelist;
 923                 rp->r_freeb = rp4freelist->r_freeb;
 924                 rp4freelist->r_freeb->r_freef = rp;
 925                 rp4freelist->r_freeb = rp;
 926                 if (!nfs4_has_pages(vp) && rp->r_dir == NULL &&
 927                     rp->r_symlink.contents == NULL && rp->r_secattr == NULL)
 928                         rp4freelist = rp;
 929         }
 930         mutex_exit(&rp4freelist_lock);
 931
 932         rw_exit(&rp->r_hashq->r_lock);
 933
 934         if (xattr)
 935                 VN_RELE(xattr);
 936 }
 937
 938 /*
 939  * Remove an rnode from the free list.
 940  *
 941  * The caller must be holding rp4freelist_lock and the rnode
 942  * must be on the freelist.
 943  */
 944 static void
 945 rp4_rmfree(rnode4_t *rp)
 946 {
 947
 948         ASSERT(MUTEX_HELD(&rp4freelist_lock));
 949         ASSERT(rp->r_freef != NULL && rp->r_freeb != NULL);
 950
 951         if (rp == rp4freelist) {
 952                 rp4freelist = rp->r_freef;
 953                 if (rp == rp4freelist)
 954                         rp4freelist = NULL;
 955         }
 956         rp->r_freeb->r_freef = rp->r_freef;
 957         rp->r_freef->r_freeb = rp->r_freeb;
 958
 959         rp->r_freef = rp->r_freeb = NULL;
 960 }
 961
 962 /*
 963  * Put a rnode in the hash table.
 964  *
 965  * The caller must be holding the exclusive hash queue lock
 966  */
 967 void
 968 rp4_addhash(rnode4_t *rp)
 969 {
 970         mntinfo4_t *mi;
 971
 972         ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
 973         ASSERT(!(rp->r_flags & R4HASHED));
 974
 975 #ifdef DEBUG
 976         r4_dup_check(rp, RTOV4(rp)->v_vfsp);
 977 #endif
 978
 979         rp->r_hashf = rp->r_hashq->r_hashf;
 980         rp->r_hashq->r_hashf = rp;
 981         rp->r_hashb = (rnode4_t *)rp->r_hashq;
 982         rp->r_hashf->r_hashb = rp;
 983
 984         mutex_enter(&rp->r_statelock);
 985         rp->r_flags |= R4HASHED;
 986         mutex_exit(&rp->r_statelock);
 987
 988         mi = VTOMI4(RTOV4(rp));
 989         mutex_enter(&mi->mi_rnodes_lock);
 990         list_insert_tail(&mi->mi_rnodes, rp);
 991         mutex_exit(&mi->mi_rnodes_lock);
 992 }
 993
 994 /*
 995  * Remove a rnode from the hash table.
 996  *
 997  * The caller must be holding the hash queue lock.
 998  */
 999 void
1000 rp4_rmhash_locked(rnode4_t *rp)
1001 {
1002         mntinfo4_t *mi;
1003
1004         ASSERT(RW_WRITE_HELD(&rp->r_hashq->r_lock));
1005         ASSERT(rp->r_flags & R4HASHED);
1006
1007         rp->r_hashb->r_hashf = rp->r_hashf;
1008         rp->r_hashf->r_hashb = rp->r_hashb;
1009
1010         mutex_enter(&rp->r_statelock);
1011         rp->r_flags &= ~R4HASHED;
1012         mutex_exit(&rp->r_statelock);
1013
1014         mi = VTOMI4(RTOV4(rp));
1015         mutex_enter(&mi->mi_rnodes_lock);
1016         if (list_link_active(&rp->r_mi_link))
1017                 list_remove(&mi->mi_rnodes, rp);
1018         mutex_exit(&mi->mi_rnodes_lock);
1019 }
1020
/*
 * Convenience wrapper around rp4_rmhash_locked(): takes the rnode's hash
 * queue lock as writer, unhashes the rnode, and drops the lock again.
 *
 * The caller must not already hold the hash queue lock.
 */
void
rp4_rmhash(rnode4_t *rp)
{
        r4hashq_t *hq = rp->r_hashq;

        rw_enter(&hq->r_lock, RW_WRITER);
        rp4_rmhash_locked(rp);
        rw_exit(&hq->r_lock);
}
1033
/*
 * Search one hash queue for the rnode matching a filehandle and vfs.
 * Rnodes flagged R4RECOVERR are skipped.
 *
 * Returns the matching rnode, or NULL when there is none.  A returned
 * rnode carries a reference for the caller: either the one it held
 * while sitting on the freelist, or a fresh VN_HOLD.
 *
 * The caller must hold the hash queue lock, shared or exclusive.
 */
rnode4_t *
r4find(r4hashq_t *rhtp, nfs4_sharedfh_t *fh, struct vfs *vfsp)
{
        rnode4_t *rp;
        vnode_t *vp;
        int recov_failed;
        int reused;

        ASSERT(RW_LOCK_HELD(&rhtp->r_lock));

        for (rp = rhtp->r_hashf; rp != (rnode4_t *)rhtp; rp = rp->r_hashf) {
                vp = RTOV4(rp);
                if (vp->v_vfsp != vfsp || !(SFH4_SAME(rp->r_fh, fh)))
                        continue;

                /* r_flags is sampled under r_statelock. */
                mutex_enter(&rp->r_statelock);
                recov_failed = (rp->r_flags & R4RECOVERR) != 0;
                mutex_exit(&rp->r_statelock);
                if (recov_failed)
                        continue;
#ifdef DEBUG
                r4_dup_check(rp, vfsp);
#endif
                /*
                 * If the rnode is on the freelist, pull it off and
                 * reuse the reference it held there.  The unlocked
                 * test is only a hint; it is repeated under
                 * rp4freelist_lock before acting on it.  In every
                 * other case take a new hold.
                 */
                reused = 0;
                if (rp->r_freef != NULL) {
                        mutex_enter(&rp4freelist_lock);
                        if (rp->r_freef != NULL) {
                                rp4_rmfree(rp);
                                reused = 1;
                        }
                        mutex_exit(&rp4freelist_lock);
                }
                if (!reused)
                        VN_HOLD(vp);

                /*
                 * Mark the vnode as a root vnode when the filehandle
                 * is a root filehandle and the flag is not yet set.
                 */
                if (isrootfh(fh, rp) && !(vp->v_flag & VROOT)) {
                        mutex_enter(&vp->v_lock);
                        vp->v_flag |= VROOT;
                        mutex_exit(&vp->v_lock);
                }
                return (rp);
        }
        return (NULL);
}
1095
/*
 * Look up an rnode by filehandle when the caller does not hold the hash
 * bucket lock.  Hashes the filehandle to pick the bucket, then calls
 * r4find() with that bucket's lock held as reader.
 */
rnode4_t *
r4find_unlocked(nfs4_sharedfh_t *fh, struct vfs *vfsp)
{
        r4hashq_t *bucket = &rtable4[rtable4hash(fh)];
        rnode4_t *found;

        rw_enter(&bucket->r_lock, RW_READER);
        found = r4find(bucket, fh, vfsp);
        rw_exit(&bucket->r_lock);

        return (found);
}
1114
/*
 * Report whether this vfs still has an active rnode.  Returns 1 when
 * any rnode on the mount's rnode list is off the freelist, has pages
 * and is flagged R4DIRTY, or has a non-zero r_count; returns 0
 * otherwise.
 *
 * Only mi_rnodes_lock is held while the per-rnode fields are examined.
 * This is safe because destroy_rtable4(), rp4_addfree(), etc. redo the
 * necessary checks before actually destroying any rnodes.
 */
int
check_rtable4(struct vfs *vfsp)
{
        mntinfo4_t *mi;
        rnode4_t *rp;
        vnode_t *vp;
        int active = 0;

        ASSERT(vfsp != NULL);
        mi = VFTOMI4(vfsp);

        mutex_enter(&mi->mi_rnodes_lock);
        rp = list_head(&mi->mi_rnodes);
        while (rp != NULL) {
                vp = RTOV4(rp);

                if (rp->r_freef == NULL ||
                    (nfs4_has_pages(vp) && (rp->r_flags & R4DIRTY)) ||
                    rp->r_count > 0) {
                        active = 1;
                        break;
                }
                rp = list_next(&mi->mi_rnodes, rp);
        }
        mutex_exit(&mi->mi_rnodes_lock);

        return (active);
}
1150
/*
 * Destroy the inactive rnodes belonging to this vfs.  All of the vnodes
 * should be inactive.  It is essential that all rnodes are destroyed on
 * a forced unmount as well as on a normal unmount.
 *
 * Each rnode is taken off the mount's rnode list; those still on the
 * freelist are removed from it, unhashed, and handed to rp4_addfree().
 */
void
destroy_rtable4(struct vfs *vfsp, cred_t *cr)
{
        mntinfo4_t *mi;
        rnode4_t *rp;

        ASSERT(vfsp != NULL);
        mi = VFTOMI4(vfsp);

        mutex_enter(&rp4freelist_lock);
        mutex_enter(&mi->mi_rnodes_lock);
        for (;;) {
                rp = list_remove_head(&mi->mi_rnodes);
                if (rp == NULL)
                        break;

                /*
                 * An rnode that is no longer on the freelist is not
                 * ours; some other thread will deal with it.
                 */
                if (rp->r_freef == NULL)
                        continue;

                /* Both locks are dropped before unhashing and freeing. */
                mutex_exit(&mi->mi_rnodes_lock);
                rp4_rmfree(rp);
                mutex_exit(&rp4freelist_lock);

                rp4_rmhash(rp);

                /*
                 * rp4_addfree() will end up destroying the rnode, but
                 * in a safe way with the appropriate checks done.
                 */
                rp4_addfree(rp, cr);

                mutex_enter(&rp4freelist_lock);
                mutex_enter(&mi->mi_rnodes_lock);
        }
        mutex_exit(&mi->mi_rnodes_lock);
        mutex_exit(&rp4freelist_lock);
}
1198
/*
 * Tear down an rnode: release its resources, return it to the rnode
 * cache, then invalidate and free its vnode and drop the hold on the
 * vfs.  The rnode must not have a delegation (asserted).
 */
static void
destroy_rnode4(rnode4_t *rp)
{
        /* Capture these first; rp is not touched once it has been freed. */
        vnode_t *vp = RTOV4(rp);
        vfs_t *vfsp = vp->v_vfsp;

        ASSERT(rp->r_deleg_type == OPEN_DELEGATE_NONE);

        uninit_rnode4(rp);
        atomic_dec_ulong((ulong_t *)&rnode4_new);
#ifdef DEBUG
        clstat4_debug.nrnode.value.ui64--;
#endif
        kmem_cache_free(rnode4_cache, rp);

        vn_invalid(vp);
        vn_free(vp);

        VFS_RELE(vfsp);
}
1224
/*
 * Invalidate the cached attributes on rnodes, forcing the next getattr
 * to go over the wire.  Used to flush stale uid and gid mappings.
 * May be applied to a single vfs, or to every rnode when vfsp is NULL.
 *
 * Rnodes whose r_statelock cannot be acquired immediately are skipped.
 */
void
nfs4_rnode_invalidate(struct vfs *vfsp)
{
        r4hashq_t *hq;
        rnode4_t *rp;
        int i;

        /* Visit every hash queue, holding its lock as reader. */
        for (i = 0; i < rtable4size; i++) {
                hq = &rtable4[i];
                rw_enter(&hq->r_lock, RW_READER);
                for (rp = hq->r_hashf; rp != (rnode4_t *)hq;
                    rp = rp->r_hashf) {
                        if (vfsp != NULL && RTOV4(rp)->v_vfsp != vfsp)
                                continue;

                        if (mutex_tryenter(&rp->r_statelock)) {
                                /*
                                 * Expire the attributes by resetting
                                 * the change and attr timeout.
                                 */
                                rp->r_change = 0;
                                PURGE_ATTRCACHE4_LOCKED(rp);
                                mutex_exit(&rp->r_statelock);
                        }
                }
                rw_exit(&hq->r_lock);
        }
}
1263
/*
 * Start an asynchronous page flush on the vnodes of one vfs, or of
 * every vfs when vfsp is NULL.  Used by nfs_sync and by nfs_unmount.
 *
 * Candidate vnodes are first collected (and held) while the relevant
 * list or hash queue lock is held; the putpage calls are issued after
 * those locks have been dropped.
 */
void
r4flush(struct vfs *vfsp, cred_t *cr)
{
        vnode_t **vplist;
        vnode_t *vp;
        rnode4_t *rp;
        long nslots, nheld;
        int index;

        /*
         * Nothing to do when there are no rnodes at all.
         */
        nslots = rnode4_new;
        if (nslots == 0)
                return;

        /*
         * Reserve one slot per currently existing rnode, on the
         * supposition that every one of them may need flushing.
         */
        vplist = kmem_alloc(nslots * sizeof (*vplist), KM_SLEEP);
        nheld = 0;

        if (vfsp == NULL) {
                /*
                 * No particular vfs: walk every hash queue looking for
                 * rnodes with pages that need to be pushed.
                 */
                for (index = 0; index < rtable4size; index++) {
                        rw_enter(&rtable4[index].r_lock, RW_READER);
                        for (rp = rtable4[index].r_hashf;
                            rp != (rnode4_t *)(&rtable4[index]);
                            rp = rp->r_hashf) {
                                vp = RTOV4(rp);
                                /*
                                 * Skip vnodes that are part of the
                                 * virtual swap device or are read-only.
                                 */
                                if (IS_SWAPVP(vp) || vn_is_readonly(vp))
                                        continue;
                                /*
                                 * Hold and remember vnodes that have
                                 * pages and are dirty or mmap'd.
                                 */
                                if (nfs4_has_pages(vp) &&
                                    ((rp->r_flags & R4DIRTY) ||
                                    rp->r_mapcnt > 0)) {
                                        VN_HOLD(vp);
                                        vplist[nheld++] = vp;
                                        if (nheld == nslots) {
                                                /*
                                                 * The list is full;
                                                 * flush what we have.
                                                 */
                                                rw_exit(
                                                    &rtable4[index].r_lock);
                                                goto flush;
                                        }
                                }
                        }
                        rw_exit(&rtable4[index].r_lock);
                }
        } else {
                /*
                 * Fast path: the vfs is known, so only the rnodes on
                 * its own list are visited.  This is much cheaper than
                 * walking rtable4 when most rnodes belong to other
                 * file systems.
                 */
                mntinfo4_t *mi = VFTOMI4(vfsp);

                mutex_enter(&mi->mi_rnodes_lock);
                for (rp = list_head(&mi->mi_rnodes); rp != NULL;
                    rp = list_next(&mi->mi_rnodes, rp)) {
                        vp = RTOV4(rp);
                        /*
                         * Skip vnodes that are part of the virtual
                         * swap device or are read-only.
                         */
                        if (IS_SWAPVP(vp) || vn_is_readonly(vp))
                                continue;

                        ASSERT(vp->v_vfsp == vfsp);
                        /*
                         * Hold and remember vnodes that have pages and
                         * are dirty or mmap'd.
                         */
                        if (nfs4_has_pages(vp) &&
                            ((rp->r_flags & R4DIRTY) || rp->r_mapcnt > 0)) {
                                VN_HOLD(vp);
                                vplist[nheld++] = vp;
                                /* The list is full; we are done for now. */
                                if (nheld == nslots)
                                        break;
                        }
                }
                mutex_exit(&mi->mi_rnodes_lock);
        }

flush:
        /*
         * Push and release every collected vnode, last one first.
         */
        while (nheld > 0) {
                vp = vplist[--nheld];
                (void) fop_putpage(vp, 0, 0, B_ASYNC, cr, NULL);
                VN_RELE(vp);
        }

        /*
         * Give back the space that held the list.
         */
        kmem_free(vplist, nslots * sizeof (*vplist));
}
1394
/*
 * Release the caches attached to an rnode: access cache, readdir cache,
 * cached symlink contents, cached ACL, and the cached xattr directory
 * vnode.
 *
 * Returns 1 when any of the readdir, symlink, ACL or xattr caches was
 * present; otherwise returns whatever nfs4_access_purge_rp() reported.
 */
int
nfs4_free_data_reclaim(rnode4_t *rp)
{
        char *contents;
        int size;
        vsecattr_t *vsp;
        vnode_t *xattr;
        bool_t rdc;
        int freed;

        /*
         * Detach the cached items under r_statelock; they are released
         * below, once the lock has been dropped.
         */
        mutex_enter(&rp->r_statelock);
        rdc = (rp->r_dir != NULL) ? TRUE : FALSE;

        contents = rp->r_symlink.contents;
        size = rp->r_symlink.size;
        rp->r_symlink.contents = NULL;

        vsp = rp->r_secattr;
        rp->r_secattr = NULL;

        xattr = rp->r_xattr_dir;
        rp->r_xattr_dir = NULL;
        mutex_exit(&rp->r_statelock);

        /*
         * Free the access cache entries.
         */
        freed = nfs4_access_purge_rp(rp);

        /* Nothing else was cached: report only the access cache result. */
        if (!rdc && contents == NULL && vsp == NULL && xattr == NULL)
                return (freed);

        /*
         * Free the readdir cache entries, incompletely if we can't block.
         */
        nfs4_purge_rddir_cache(RTOV4(rp));

        /* Free the symbolic link cache. */
        if (contents != NULL)
                kmem_free((void *)contents, size);

        /* Free any cached ACL. */
        if (vsp != NULL)
                nfs4_acl_free_cache(vsp);

        /* Release the xattr directory vnode. */
        if (xattr != NULL)
                VN_RELE(xattr);

        return (1);
}
1456
/*
 * Release the caches attached to an rnode that may still be in use.
 * Unlike nfs4_free_data_reclaim() this never blocks on r_statelock:
 * when the lock is busy it gives up and returns 0.
 *
 * Returns 1 when any of the symlink, ACL, readdir or xattr caches was
 * present; otherwise returns whatever nfs4_access_purge_rp() reported.
 */
static int
nfs4_active_data_reclaim(rnode4_t *rp)
{
        char *contents;
        int size;
        vsecattr_t *vsp;
        vnode_t *xdir;
        vnode_t *xattr = NULL;
        bool_t rdc;
        int freed;

        if (!mutex_tryenter(&rp->r_statelock))
                return (0);

        /*
         * Detach the cached items under r_statelock; they are released
         * below, once the lock has been dropped.
         */
        contents = rp->r_symlink.contents;
        size = rp->r_symlink.size;
        rp->r_symlink.contents = NULL;

        vsp = rp->r_secattr;
        rp->r_secattr = NULL;

        rdc = (rp->r_dir != NULL) ? TRUE : FALSE;

        /*
         * To avoid a deadlock, do not free r_xattr_dir cache if it is hashed
         * on the same r_hashq queue. We are not mandated to free all caches.
         * VN_RELE(rp->r_xattr_dir) will be done sometime later - e.g. when the
         * rnode 'rp' is freed or put on the free list.
         *
         * We will retain NFS4_XATTR_DIR_NOTSUPP because:
         * - it has no associated rnode4_t (its v_data is NULL),
         * - it is preallocated statically and will never go away,
         * so we cannot save anything by releasing it.
         */
        xdir = rp->r_xattr_dir;
        if (xdir && xdir != NFS4_XATTR_DIR_NOTSUPP &&
            VTOR4(xdir)->r_hashq != rp->r_hashq) {
                xattr = xdir;
                rp->r_xattr_dir = NULL;
        }
        mutex_exit(&rp->r_statelock);

        /*
         * Free the access cache entries.
         */
        freed = nfs4_access_purge_rp(rp);

        /* Nothing else was cached: report only the access cache result. */
        if (contents == NULL && vsp == NULL && !rdc && xattr == NULL)
                return (freed);

        /* Free the symbolic link cache. */
        if (contents != NULL)
                kmem_free((void *)contents, size);

        /* Free any cached ACL. */
        if (vsp != NULL)
                nfs4_acl_free_cache(vsp);

        nfs4_purge_rddir_cache(RTOV4(rp));

        /* Release the xattr directory vnode. */
        if (xattr != NULL)
                VN_RELE(xattr);

        return (1);
}
1530
/*
 * Run nfs4_free_data_reclaim() on every rnode on the freelist while
 * holding rp4freelist_lock.  Returns 1 if any call reported that
 * something was freed, 0 otherwise.
 */
static int
nfs4_free_reclaim(void)
{
        rnode4_t *rp;
        int any = 0;

#ifdef DEBUG
        clstat4_debug.f_reclaim.value.ui64++;
#endif
        mutex_enter(&rp4freelist_lock);
        rp = rp4freelist;
        if (rp != NULL) {
                /* The freelist is circular: stop on reaching the head. */
                for (;;) {
                        if (nfs4_free_data_reclaim(rp) != 0)
                                any = 1;
                        rp = rp->r_freef;
                        if (rp == rp4freelist)
                                break;
                }
        }
        mutex_exit(&rp4freelist_lock);

        return (any);
}
1552
/*
 * Run nfs4_active_data_reclaim() on every hashed rnode, one hash queue
 * at a time with that queue's lock held as reader.  Returns 1 if any
 * call reported that something was freed, 0 otherwise.
 */
static int
nfs4_active_reclaim(void)
{
        r4hashq_t *hq;
        rnode4_t *rp;
        int any = 0;
        int i;

#ifdef DEBUG
        clstat4_debug.a_reclaim.value.ui64++;
#endif
        for (i = 0; i < rtable4size; i++) {
                hq = &rtable4[i];
                rw_enter(&hq->r_lock, RW_READER);
                for (rp = hq->r_hashf; rp != (rnode4_t *)hq;
                    rp = rp->r_hashf) {
                        if (nfs4_active_data_reclaim(rp) != 0)
                                any = 1;
                }
                rw_exit(&hq->r_lock);
        }

        return (any);
}
1576
/*
 * Drain the rnode freelist.  Each rnode is taken off the freelist,
 * unhashed if it is still hashed and nobody else holds a reference,
 * and then handed to rp4_addfree().
 *
 * An rnode that is hashed and has gained another reference is not
 * ours any more: our reference is dropped and the rnode is left to
 * its other holder.
 *
 * Returns 1 if at least one rnode was handed to rp4_addfree(), and 0
 * if nothing was reclaimed.  (The result used to be hard-wired to 0
 * because the counter was never updated.)
 */
static int
nfs4_rnode_reclaim(void)
{
        int freed;
        rnode4_t *rp;
        vnode_t *vp;

#ifdef DEBUG
        clstat4_debug.r_reclaim.value.ui64++;
#endif
        freed = 0;
        mutex_enter(&rp4freelist_lock);
        while ((rp = rp4freelist) != NULL) {
                rp4_rmfree(rp);
                /* The freelist lock is not held across the work below. */
                mutex_exit(&rp4freelist_lock);
                if (rp->r_flags & R4HASHED) {
                        vp = RTOV4(rp);
                        rw_enter(&rp->r_hashq->r_lock, RW_WRITER);
                        mutex_enter(&vp->v_lock);
                        if (vp->v_count > 1) {
                                /*
                                 * Someone else has a reference; drop
                                 * ours and move on to the next rnode.
                                 */
                                VN_RELE_LOCKED(vp);
                                mutex_exit(&vp->v_lock);
                                rw_exit(&rp->r_hashq->r_lock);
                                mutex_enter(&rp4freelist_lock);
                                continue;
                        }
                        mutex_exit(&vp->v_lock);
                        rp4_rmhash_locked(rp);
                        rw_exit(&rp->r_hashq->r_lock);
                }
                /*
                 * This call to rp4_addfree will end up destroying the
                 * rnode, but in a safe way with the appropriate set
                 * of checks done.
                 */
                rp4_addfree(rp, CRED());
                freed = 1;
                mutex_enter(&rp4freelist_lock);
        }
        mutex_exit(&rp4freelist_lock);
        return (freed);
}
1618
/*
 * Reclaim entry point.  Tries progressively more aggressive steps and
 * stops at the first one that reports freeing something: caches of
 * freelist rnodes, then caches of hashed rnodes, and finally the
 * freelist rnodes themselves.  The argument is unused.
 */
/*ARGSUSED*/
static void
nfs4_reclaim(void *cdrarg)
{
#ifdef DEBUG
        clstat4_debug.reclaim.value.ui64++;
#endif
        if (nfs4_free_reclaim() || nfs4_active_reclaim())
                return;

        (void) nfs4_rnode_reclaim();
}
1634
1635 /*
1636  * Returns the clientid4 to use for the given mntinfo4.  Note that the
1637  * clientid can change if the caller drops mi_recovlock.
1638  */
1639
1640 clientid4
1641 mi2clientid(mntinfo4_t *mi)
1642 {
1643         nfs4_server_t   *sp;
1644         clientid4       clientid = 0;
1645
1646         /* this locks down sp if it is found */
1647         sp = find_nfs4_server(mi);
1648         if (sp != NULL) {
1649                 clientid = sp->clientid;
1650                 mutex_exit(&sp->s_lock);
1651                 nfs4_server_rele(sp);
1652         }
1653         return (clientid);
1654 }
1655
/*
 * Return the current lease time of the server associated with the given
 * file, or 1 (second) when the file system is flagged VFS_UNMOUNTED.
 * Note that the lease time could change immediately after this call.
 *
 * mi_recovlock is held as reader for the duration of the lookup.
 */
time_t
r2lease_time(rnode4_t *rp)
{
        mntinfo4_t *mi = VTOMI4(RTOV4(rp));
        nfs4_server_t *sp;
        time_t lease_time;

        (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);

        /* find_nfs4_server() locks down sp if it is found. */
        sp = find_nfs4_server(mi);

        if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
                lease_time = 1;         /* 1 second */
        } else {
                ASSERT(sp != NULL);
                lease_time = sp->s_lease_time;
        }

        /* Only the unmounted case can get here without a server. */
        if (sp != NULL) {
                mutex_exit(&sp->s_lock);
                nfs4_server_rele(sp);
        }
        nfs_rw_exit(&mi->mi_recovlock);

        return (lease_time);
}
1693
1694 /*
1695  * Return a list with information about all the known open instances for
1696  * a filesystem. The caller must call r4releopenlist() when done with the
1697  * list.
1698  *
1699  * We are safe at looking at os_valid and os_pending_close across dropping
1700  * the 'os_sync_lock' to count up the number of open streams and then
1701  * allocate memory for the osp list due to:
1702  *      -Looking at os_pending_close is safe since this routine is
1703  *      only called via recovery, and os_pending_close can only be set via
1704  *      a non-recovery operation (which are all blocked when recovery
1705  *      is active).
1706  *
1707  *      -Examining os_valid is safe since non-recovery operations, which
1708  *      could potentially switch os_valid to 0, are blocked (via
1709  *      nfs4_start_fop) and recovery is single-threaded per mntinfo4_t
1710  *      (which means we are the only recovery thread potentially acting
1711  *      on this open stream).
1712  */
1713
1714 nfs4_opinst_t *
1715 r4mkopenlist(mntinfo4_t *mi)
1716 {
1717         nfs4_opinst_t *reopenlist, *rep;
1718         rnode4_t *rp;
1719         vnode_t *vp;
1720         vfs_t *vfsp = mi->mi_vfsp;
1721         int numosp;
1722         nfs4_open_stream_t *osp;
1723         int index;
1724         open_delegation_type4 dtype;
1725         int hold_vnode;
1726
1727         reopenlist = NULL;
1728
1729         for (index = 0; index < rtable4size; index++) {
1730                 rw_enter(&rtable4[index].r_lock, RW_READER);
1731                 for (rp = rtable4[index].r_hashf;
1732                     rp != (rnode4_t *)(&rtable4[index]);
1733                     rp = rp->r_hashf) {
1734
1735                         vp = RTOV4(rp);
1736                         if (vp->v_vfsp != vfsp)
1737                                 continue;
1738                         hold_vnode = 0;
1739
1740                         mutex_enter(&rp->r_os_lock);
1741
1742                         /* Count the number of valid open_streams of the file */
1743                         numosp = 0;
1744                         for (osp = list_head(&rp->r_open_streams); osp != NULL;
1745                             osp = list_next(&rp->r_open_streams, osp)) {
1746                                 mutex_enter(&osp->os_sync_lock);
1747                                 if (osp->os_valid && !osp->os_pending_close)
1748                                         numosp++;
1749                                 mutex_exit(&osp->os_sync_lock);
1750                         }
1751
1752                         /* Fill in the valid open streams per vp */
1753                         if (numosp > 0) {
1754                                 int j;
1755
1756                                 hold_vnode = 1;
1757
1758                                 /*
1759                                  * Add a new open instance to the list
1760                                  */
1761                                 rep = kmem_zalloc(sizeof (*reopenlist),
1762                                     KM_SLEEP);
1763                                 rep->re_next = reopenlist;
1764                                 reopenlist = rep;
1765
1766                                 rep->re_vp = vp;
1767                                 rep->re_osp = kmem_zalloc(
1768                                     numosp * sizeof (*(rep->re_osp)),
1769                                     KM_SLEEP);
1770                                 rep->re_numosp = numosp;
1771
1772                                 j = 0;
1773                                 for (osp = list_head(&rp->r_open_streams);
1774                                     osp != NULL;
1775                                     osp = list_next(&rp->r_open_streams, osp)) {
1776
1777                                         mutex_enter(&osp->os_sync_lock);
1778                                         if (osp->os_valid &&
1779                                             !osp->os_pending_close) {
1780                                                 osp->os_ref_count++;
1781                                                 rep->re_osp[j] = osp;
1782                                                 j++;
1783                                         }
1784                                         mutex_exit(&osp->os_sync_lock);
1785                                 }
1786                                 /*
1787                                  * Assuming valid osp(s) stays valid between
1788                                  * the time obtaining j and numosp.
1789                                  */
1790                                 ASSERT(j == numosp);
1791                         }
1792
1793                         mutex_exit(&rp->r_os_lock);
1794                         /* do this here to keep v_lock > r_os_lock */
1795                         if (hold_vnode)
1796                                 VN_HOLD(vp);
1797                         mutex_enter(&rp->r_statev4_lock);
1798                         if (rp->r_deleg_type != OPEN_DELEGATE_NONE) {
1799                                 /*
1800                                  * If this rnode holds a delegation,
1801                                  * but if there are no valid open streams,
1802                                  * then just discard the delegation
1803                                  * without doing delegreturn.
1804                                  */
1805                                 if (numosp > 0)
1806                                         rp->r_deleg_needs_recovery =
1807                                             rp->r_deleg_type;
1808                         }
1809                         /* Save the delegation type for use outside the lock */
1810                         dtype = rp->r_deleg_type;
1811                         mutex_exit(&rp->r_statev4_lock);
1812
1813                         /*
1814                          * If we have a delegation then get rid of it.
1815                          * We've set rp->r_deleg_needs_recovery so we have
1816                          * enough information to recover.
1817                          */
1818                         if (dtype != OPEN_DELEGATE_NONE) {
1819                                 (void) nfs4delegreturn(rp, NFS4_DR_DISCARD);
1820                         }
1821                 }
1822                 rw_exit(&rtable4[index].r_lock);
1823         }
1824         return (reopenlist);
1825 }
1826
1827 /*
1828  * Given a filesystem id, check to see if any rnodes
1829  * within this fsid reside in the rnode cache, other
1830  * than one we know about.
1831  *
1832  * Return 1 if an rnode is found, 0 otherwise
1833  */
1834 int
1835 r4find_by_fsid(mntinfo4_t *mi, fattr4_fsid *moved_fsid)
1836 {
1837         rnode4_t *rp;
1838         vnode_t *vp;
1839         vfs_t *vfsp = mi->mi_vfsp;
1840         fattr4_fsid *fsid;
1841         int index, found = 0;
1842
1843         for (index = 0; index < rtable4size; index++) {
1844                 rw_enter(&rtable4[index].r_lock, RW_READER);
1845                 for (rp = rtable4[index].r_hashf;
1846                     rp != (rnode4_t *)(&rtable4[index]);
1847                     rp = rp->r_hashf) {
1848
1849                         vp = RTOV4(rp);
1850                         if (vp->v_vfsp != vfsp)
1851                                 continue;
1852
1853                         /*
1854                          * XXX there might be a case where a
1855                          * replicated fs may have the same fsid
1856                          * across two different servers. This
1857                          * check isn't good enough in that case
1858                          */
1859                         fsid = &rp->r_srv_fsid;
1860                         if (FATTR4_FSID_EQ(moved_fsid, fsid)) {
1861                                 found = 1;
1862                                 break;
1863                         }
1864                 }
1865                 rw_exit(&rtable4[index].r_lock);
1866
1867                 if (found)
1868                         break;
1869         }
1870         return (found);
1871 }
1872
1873 /*
1874  * Release the list of open instance references.
1875  */
1876
1877 void
1878 r4releopenlist(nfs4_opinst_t *reopenp)
1879 {
1880         nfs4_opinst_t *rep, *next;
1881         int i;
1882
1883         for (rep = reopenp; rep; rep = next) {
1884                 next = rep->re_next;
1885
1886                 for (i = 0; i < rep->re_numosp; i++)
1887                         open_stream_rele(rep->re_osp[i], VTOR4(rep->re_vp));
1888
1889                 VN_RELE(rep->re_vp);
1890                 kmem_free(rep->re_osp,
1891                     rep->re_numosp * sizeof (*(rep->re_osp)));
1892
1893                 kmem_free(rep, sizeof (*rep));
1894         }
1895 }
1896
1897 int
1898 nfs4_rnode_init(void)
1899 {
1900         ulong_t nrnode4_max;
1901         int i;
1902
1903         /*
1904          * Compute the size of the rnode4 hash table
1905          */
1906         if (nrnode <= 0)
1907                 nrnode = ncsize;
1908         nrnode4_max =
1909             (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct rnode4));
1910         if (nrnode > nrnode4_max || (nrnode == 0 && ncsize == 0)) {
1911                 zcmn_err(GLOBAL_ZONEID, CE_NOTE,
1912                     "!setting nrnode to max value of %ld", nrnode4_max);
1913                 nrnode = nrnode4_max;
1914         }
1915         rtable4size = 1 << highbit(nrnode / rnode4_hashlen);
1916         rtable4mask = rtable4size - 1;
1917
1918         /*
1919          * Allocate and initialize the hash buckets
1920          */
1921         rtable4 = kmem_alloc(rtable4size * sizeof (*rtable4), KM_SLEEP);
1922         for (i = 0; i < rtable4size; i++) {
1923                 rtable4[i].r_hashf = (rnode4_t *)(&rtable4[i]);
1924                 rtable4[i].r_hashb = (rnode4_t *)(&rtable4[i]);
1925                 rw_init(&rtable4[i].r_lock, NULL, RW_DEFAULT, NULL);
1926         }
1927
1928         rnode4_cache = kmem_cache_create("rnode4_cache", sizeof (rnode4_t),
1929             0, NULL, NULL, nfs4_reclaim, NULL, NULL, 0);
1930
1931         return (0);
1932 }
1933
1934 int
1935 nfs4_rnode_fini(void)
1936 {
1937         int i;
1938
1939         /*
1940          * Deallocate the rnode hash queues
1941          */
1942         kmem_cache_destroy(rnode4_cache);
1943
1944         for (i = 0; i < rtable4size; i++)
1945                 rw_destroy(&rtable4[i].r_lock);
1946
1947         kmem_free(rtable4, rtable4size * sizeof (*rtable4));
1948
1949         return (0);
1950 }
1951
1952 /*
1953  * Return non-zero if the given filehandle refers to the root filehandle
1954  * for the given rnode.
1955  */
1956
1957 static int
1958 isrootfh(nfs4_sharedfh_t *fh, rnode4_t *rp)
1959 {
1960         int isroot;
1961
1962         isroot = 0;
1963         if (SFH4_SAME(VTOMI4(RTOV4(rp))->mi_rootfh, fh))
1964                 isroot = 1;
1965
1966         return (isroot);
1967 }
1968
1969 /*
1970  * The r4_stub_* routines assume that the rnode is newly activated, and
1971  * that the caller either holds the hash bucket r_lock for this rnode as
1972  * RW_WRITER, or holds r_statelock.
1973  */
1974 static void
1975 r4_stub_set(rnode4_t *rp, nfs4_stub_type_t type)
1976 {
1977         vnode_t *vp = RTOV4(rp);
1978         krwlock_t *hash_lock = &rp->r_hashq->r_lock;
1979
1980         ASSERT(RW_WRITE_HELD(hash_lock) || MUTEX_HELD(&rp->r_statelock));
1981
1982         rp->r_stub_type = type;
1983
1984         /*
1985          * Safely switch this vnode to the trigger vnodeops.
1986          *
1987          * Currently, we don't ever switch a trigger vnode back to using
1988          * "regular" v4 vnodeops. NFS4_STUB_NONE is only used to note that
1989          * a new v4 object is not a trigger, and it will already have the
1990          * correct v4 vnodeops by default. So, no "else" case required here.
1991          */
1992         if (type != NFS4_STUB_NONE)
1993                 vn_setops(vp, &nfs4_trigger_vnodeops);
1994 }
1995
1996 void
1997 r4_stub_mirrormount(rnode4_t *rp)
1998 {
1999         r4_stub_set(rp, NFS4_STUB_MIRRORMOUNT);
2000 }
2001
2002 void
2003 r4_stub_referral(rnode4_t *rp)
2004 {
2005         DTRACE_PROBE1(nfs4clnt__func__referral__moved,
2006             vnode_t *, RTOV4(rp));
2007         r4_stub_set(rp, NFS4_STUB_REFERRAL);
2008 }
2009
2010 void
2011 r4_stub_none(rnode4_t *rp)
2012 {
2013         r4_stub_set(rp, NFS4_STUB_NONE);
2014 }
2015
2016 #ifdef DEBUG
2017
2018 /*
2019  * Look in the rnode table for other rnodes that have the same filehandle.
2020  * Assume the lock is held for the hash chain of checkrp
2021  */
2022
2023 static void
2024 r4_dup_check(rnode4_t *checkrp, vfs_t *vfsp)
2025 {
2026         rnode4_t *rp;
2027         vnode_t *tvp;
2028         nfs4_fhandle_t fh, fh2;
2029         int index;
2030
2031         if (!r4_check_for_dups)
2032                 return;
2033
2034         ASSERT(RW_LOCK_HELD(&checkrp->r_hashq->r_lock));
2035
2036         sfh4_copyval(checkrp->r_fh, &fh);
2037
2038         for (index = 0; index < rtable4size; index++) {
2039
2040                 if (&rtable4[index] != checkrp->r_hashq)
2041                         rw_enter(&rtable4[index].r_lock, RW_READER);
2042
2043                 for (rp = rtable4[index].r_hashf;
2044                     rp != (rnode4_t *)(&rtable4[index]);
2045                     rp = rp->r_hashf) {
2046
2047                         if (rp == checkrp)
2048                                 continue;
2049
2050                         tvp = RTOV4(rp);
2051                         if (tvp->v_vfsp != vfsp)
2052                                 continue;
2053
2054                         sfh4_copyval(rp->r_fh, &fh2);
2055                         if (nfs4cmpfhandle(&fh, &fh2) == 0) {
2056                                 cmn_err(CE_PANIC, "rnodes with same fs, fh "
2057                                     "(%p, %p)", (void *)checkrp, (void *)rp);
2058                         }
2059                 }
2060
2061                 if (&rtable4[index] != checkrp->r_hashq)
2062                         rw_exit(&rtable4[index].r_lock);
2063         }
2064 }
2065
2066 #endif /* DEBUG */