su.static: link with proto area libs (esp. libc)
[unleashed.git] / kernel / fs / nfs / nfs4_state.c
blob1483bce9ae68e9155a90fc60a1d134486f892c78
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
26 #include <sys/systm.h>
27 #include <sys/kmem.h>
28 #include <sys/cmn_err.h>
29 #include <sys/atomic.h>
30 #include <sys/flock.h>
31 #include <nfs/export.h>
32 #include <nfs/nfs.h>
33 #include <nfs/nfs4.h>
34 #include <nfs/nfssys.h>
35 #include <nfs/lm.h>
36 #include <sys/pathname.h>
37 #include <sys/sdt.h>
38 #include <sys/nvpair.h>
40 extern u_longlong_t nfs4_srv_caller_id;
42 extern time_t rfs4_start_time;
43 extern uint_t nfs4_srv_vkey;
45 stateid4 special0 = {
47 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
50 stateid4 special1 = {
51 0xffffffff,
53 (char)0xff, (char)0xff, (char)0xff, (char)0xff,
54 (char)0xff, (char)0xff, (char)0xff, (char)0xff,
55 (char)0xff, (char)0xff, (char)0xff, (char)0xff
60 #define ISSPECIAL(id) (stateid4_cmp(id, &special0) || \
61 stateid4_cmp(id, &special1))
63 /* For embedding the cluster nodeid into our clientid */
64 #define CLUSTER_NODEID_SHIFT 24
65 #define CLUSTER_MAX_NODEID 255
67 #ifdef DEBUG
68 int rfs4_debug;
69 #endif
71 static uint32_t rfs4_database_debug = 0x00;
73 static void rfs4_ss_clid_write(rfs4_client_t *cp, char *leaf);
74 static void rfs4_ss_clid_write_one(rfs4_client_t *cp, char *dir, char *leaf);
75 static void rfs4_dss_clear_oldstate(rfs4_servinst_t *sip);
76 static void rfs4_ss_chkclid_sip(rfs4_client_t *cp, rfs4_servinst_t *sip);
79 * Couple of simple init/destroy functions for a general waiter
81 void
82 rfs4_sw_init(rfs4_state_wait_t *swp)
84 mutex_init(swp->sw_cv_lock, NULL, MUTEX_DEFAULT, NULL);
85 cv_init(swp->sw_cv, NULL, CV_DEFAULT, NULL);
86 swp->sw_active = FALSE;
87 swp->sw_wait_count = 0;
90 void
91 rfs4_sw_destroy(rfs4_state_wait_t *swp)
93 mutex_destroy(swp->sw_cv_lock);
94 cv_destroy(swp->sw_cv);
97 void
98 rfs4_sw_enter(rfs4_state_wait_t *swp)
100 mutex_enter(swp->sw_cv_lock);
101 while (swp->sw_active) {
102 swp->sw_wait_count++;
103 cv_wait(swp->sw_cv, swp->sw_cv_lock);
104 swp->sw_wait_count--;
106 ASSERT(swp->sw_active == FALSE);
107 swp->sw_active = TRUE;
108 mutex_exit(swp->sw_cv_lock);
111 void
112 rfs4_sw_exit(rfs4_state_wait_t *swp)
114 mutex_enter(swp->sw_cv_lock);
115 ASSERT(swp->sw_active == TRUE);
116 swp->sw_active = FALSE;
117 if (swp->sw_wait_count != 0)
118 cv_broadcast(swp->sw_cv);
119 mutex_exit(swp->sw_cv_lock);
123 * CPR callback id -- not related to v4 callbacks
125 static callb_id_t cpr_id = 0;
127 static void
128 deep_lock_copy(LOCK4res *dres, LOCK4res *sres)
130 lock_owner4 *slo = &sres->LOCK4res_u.denied.owner;
131 lock_owner4 *dlo = &dres->LOCK4res_u.denied.owner;
133 if (sres->status == NFS4ERR_DENIED) {
134 dlo->owner_val = kmem_alloc(slo->owner_len, KM_SLEEP);
135 bcopy(slo->owner_val, dlo->owner_val, slo->owner_len);
139 static void
140 deep_lock_free(LOCK4res *res)
142 lock_owner4 *lo = &res->LOCK4res_u.denied.owner;
144 if (res->status == NFS4ERR_DENIED)
145 kmem_free(lo->owner_val, lo->owner_len);
148 static void
149 deep_open_copy(OPEN4res *dres, OPEN4res *sres)
151 nfsace4 *sacep, *dacep;
153 if (sres->status != NFS4_OK) {
154 return;
157 dres->attrset = sres->attrset;
159 switch (sres->delegation.delegation_type) {
160 case OPEN_DELEGATE_NONE:
161 return;
162 case OPEN_DELEGATE_READ:
163 sacep = &sres->delegation.open_delegation4_u.read.permissions;
164 dacep = &dres->delegation.open_delegation4_u.read.permissions;
165 break;
166 case OPEN_DELEGATE_WRITE:
167 sacep = &sres->delegation.open_delegation4_u.write.permissions;
168 dacep = &dres->delegation.open_delegation4_u.write.permissions;
169 break;
171 dacep->who.utf8string_val =
172 kmem_alloc(sacep->who.utf8string_len, KM_SLEEP);
173 bcopy(sacep->who.utf8string_val, dacep->who.utf8string_val,
174 sacep->who.utf8string_len);
177 static void
178 deep_open_free(OPEN4res *res)
180 nfsace4 *acep;
181 if (res->status != NFS4_OK)
182 return;
184 switch (res->delegation.delegation_type) {
185 case OPEN_DELEGATE_NONE:
186 return;
187 case OPEN_DELEGATE_READ:
188 acep = &res->delegation.open_delegation4_u.read.permissions;
189 break;
190 case OPEN_DELEGATE_WRITE:
191 acep = &res->delegation.open_delegation4_u.write.permissions;
192 break;
195 if (acep->who.utf8string_val) {
196 kmem_free(acep->who.utf8string_val, acep->who.utf8string_len);
197 acep->who.utf8string_val = NULL;
201 void
202 rfs4_free_reply(nfs_resop4 *rp)
204 switch (rp->resop) {
205 case OP_LOCK:
206 deep_lock_free(&rp->nfs_resop4_u.oplock);
207 break;
208 case OP_OPEN:
209 deep_open_free(&rp->nfs_resop4_u.opopen);
210 default:
211 break;
215 void
216 rfs4_copy_reply(nfs_resop4 *dst, nfs_resop4 *src)
218 *dst = *src;
220 /* Handle responses that need deep copy */
221 switch (src->resop) {
222 case OP_LOCK:
223 deep_lock_copy(&dst->nfs_resop4_u.oplock,
224 &src->nfs_resop4_u.oplock);
225 break;
226 case OP_OPEN:
227 deep_open_copy(&dst->nfs_resop4_u.opopen,
228 &src->nfs_resop4_u.opopen);
229 break;
230 default:
231 break;
236 * This is the implementation of the underlying state engine. The
237 * public interface to this engine is described by
238 * nfs4_state.h. Callers to the engine should hold no state engine
239 * locks when they call in to it. If the protocol needs to lock data
240 * structures it should do so after acquiring all references to them
241 * first and then follow the following lock order:
243 * client > openowner > state > lo_state > lockowner > file.
245 * Internally we only allow a thread to hold one hash bucket lock at a
246 * time and the lock is higher in the lock order (must be acquired
247 * first) than the data structure that is on that hash list.
249 * If a new reference was acquired by the caller, that reference needs
250 * to be released after releasing all acquired locks with the
251 * corresponding rfs4_*_rele routine.
255 * This code is somewhat prototypical for now. Its purpose currently is to
256 * implement the interfaces sufficiently to finish the higher protocol
257 * elements. This will be replaced by dynamically resizeable tables
258 * backed by the kmem_cache allocator. However synchronization is handled
259 * correctly (I hope) and will not change by much. The mutexes for
260 * the hash buckets that can be used to create new instances of data
261 * structures might be good candidates to evolve into reader writer
262 * locks. If it has to do a creation, it would be holding the
263 * mutex across a kmem_alloc with KM_SLEEP specified.
266 #ifdef DEBUG
267 #define TABSIZE 17
268 #else
269 #define TABSIZE 2047
270 #endif
272 #define ADDRHASH(key) ((unsigned long)(key) >> 3)
274 /* Used to serialize create/destroy of rfs4_server_state database */
275 kmutex_t rfs4_state_lock;
276 static rfs4_database_t *rfs4_server_state = NULL;
278 /* Used to serialize lookups of clientids */
279 static krwlock_t rfs4_findclient_lock;
282 * For now this "table" is exposed so that the CPR callback
283 * function can tromp through it..
285 rfs4_table_t *rfs4_client_tab;
287 static rfs4_index_t *rfs4_clientid_idx;
288 static rfs4_index_t *rfs4_nfsclnt_idx;
289 static rfs4_table_t *rfs4_clntip_tab;
290 static rfs4_index_t *rfs4_clntip_idx;
291 static rfs4_table_t *rfs4_openowner_tab;
292 static rfs4_index_t *rfs4_openowner_idx;
293 static rfs4_table_t *rfs4_state_tab;
294 static rfs4_index_t *rfs4_state_idx;
295 static rfs4_index_t *rfs4_state_owner_file_idx;
296 static rfs4_index_t *rfs4_state_file_idx;
297 static rfs4_table_t *rfs4_lo_state_tab;
298 static rfs4_index_t *rfs4_lo_state_idx;
299 static rfs4_index_t *rfs4_lo_state_owner_idx;
300 static rfs4_table_t *rfs4_lockowner_tab;
301 static rfs4_index_t *rfs4_lockowner_idx;
302 static rfs4_index_t *rfs4_lockowner_pid_idx;
303 static rfs4_table_t *rfs4_file_tab;
304 static rfs4_index_t *rfs4_file_idx;
305 static rfs4_table_t *rfs4_deleg_state_tab;
306 static rfs4_index_t *rfs4_deleg_idx;
307 static rfs4_index_t *rfs4_deleg_state_idx;
/* Parenthesized so the product survives use inside larger expressions */
#define	MAXTABSZ	(1024 * 1024)
311 /* The values below are rfs4_lease_time units */
313 #ifdef DEBUG
314 #define CLIENT_CACHE_TIME 1
315 #define OPENOWNER_CACHE_TIME 1
316 #define STATE_CACHE_TIME 1
317 #define LO_STATE_CACHE_TIME 1
318 #define LOCKOWNER_CACHE_TIME 1
319 #define FILE_CACHE_TIME 3
320 #define DELEG_STATE_CACHE_TIME 1
321 #else
322 #define CLIENT_CACHE_TIME 10
323 #define OPENOWNER_CACHE_TIME 5
324 #define STATE_CACHE_TIME 1
325 #define LO_STATE_CACHE_TIME 1
326 #define LOCKOWNER_CACHE_TIME 3
327 #define FILE_CACHE_TIME 40
328 #define DELEG_STATE_CACHE_TIME 1
329 #endif
332 static time_t rfs4_client_cache_time = 0;
333 static time_t rfs4_clntip_cache_time = 0;
334 static time_t rfs4_openowner_cache_time = 0;
335 static time_t rfs4_state_cache_time = 0;
336 static time_t rfs4_lo_state_cache_time = 0;
337 static time_t rfs4_lockowner_cache_time = 0;
338 static time_t rfs4_file_cache_time = 0;
339 static time_t rfs4_deleg_state_cache_time = 0;
341 static bool_t rfs4_client_create(rfs4_entry_t, void *);
342 static void rfs4_dss_remove_cpleaf(rfs4_client_t *);
343 static void rfs4_dss_remove_leaf(rfs4_servinst_t *, char *, char *);
344 static void rfs4_client_destroy(rfs4_entry_t);
345 static bool_t rfs4_client_expiry(rfs4_entry_t);
346 static uint32_t clientid_hash(void *);
347 static bool_t clientid_compare(rfs4_entry_t, void *);
348 static void *clientid_mkkey(rfs4_entry_t);
349 static uint32_t nfsclnt_hash(void *);
350 static bool_t nfsclnt_compare(rfs4_entry_t, void *);
351 static void *nfsclnt_mkkey(rfs4_entry_t);
352 static bool_t rfs4_clntip_expiry(rfs4_entry_t);
353 static void rfs4_clntip_destroy(rfs4_entry_t);
354 static bool_t rfs4_clntip_create(rfs4_entry_t, void *);
355 static uint32_t clntip_hash(void *);
356 static bool_t clntip_compare(rfs4_entry_t, void *);
357 static void *clntip_mkkey(rfs4_entry_t);
358 static bool_t rfs4_openowner_create(rfs4_entry_t, void *);
359 static void rfs4_openowner_destroy(rfs4_entry_t);
360 static bool_t rfs4_openowner_expiry(rfs4_entry_t);
361 static uint32_t openowner_hash(void *);
362 static bool_t openowner_compare(rfs4_entry_t, void *);
363 static void *openowner_mkkey(rfs4_entry_t);
364 static bool_t rfs4_state_create(rfs4_entry_t, void *);
365 static void rfs4_state_destroy(rfs4_entry_t);
366 static bool_t rfs4_state_expiry(rfs4_entry_t);
367 static uint32_t state_hash(void *);
368 static bool_t state_compare(rfs4_entry_t, void *);
369 static void *state_mkkey(rfs4_entry_t);
370 static uint32_t state_owner_file_hash(void *);
371 static bool_t state_owner_file_compare(rfs4_entry_t, void *);
372 static void *state_owner_file_mkkey(rfs4_entry_t);
373 static uint32_t state_file_hash(void *);
374 static bool_t state_file_compare(rfs4_entry_t, void *);
375 static void *state_file_mkkey(rfs4_entry_t);
376 static bool_t rfs4_lo_state_create(rfs4_entry_t, void *);
377 static void rfs4_lo_state_destroy(rfs4_entry_t);
378 static bool_t rfs4_lo_state_expiry(rfs4_entry_t);
379 static uint32_t lo_state_hash(void *);
380 static bool_t lo_state_compare(rfs4_entry_t, void *);
381 static void *lo_state_mkkey(rfs4_entry_t);
382 static uint32_t lo_state_lo_hash(void *);
383 static bool_t lo_state_lo_compare(rfs4_entry_t, void *);
384 static void *lo_state_lo_mkkey(rfs4_entry_t);
385 static bool_t rfs4_lockowner_create(rfs4_entry_t, void *);
386 static void rfs4_lockowner_destroy(rfs4_entry_t);
387 static bool_t rfs4_lockowner_expiry(rfs4_entry_t);
388 static uint32_t lockowner_hash(void *);
389 static bool_t lockowner_compare(rfs4_entry_t, void *);
390 static void *lockowner_mkkey(rfs4_entry_t);
391 static uint32_t pid_hash(void *);
392 static bool_t pid_compare(rfs4_entry_t, void *);
393 static void *pid_mkkey(rfs4_entry_t);
394 static bool_t rfs4_file_create(rfs4_entry_t, void *);
395 static void rfs4_file_destroy(rfs4_entry_t);
396 static uint32_t file_hash(void *);
397 static bool_t file_compare(rfs4_entry_t, void *);
398 static void *file_mkkey(rfs4_entry_t);
399 static bool_t rfs4_deleg_state_create(rfs4_entry_t, void *);
400 static void rfs4_deleg_state_destroy(rfs4_entry_t);
401 static bool_t rfs4_deleg_state_expiry(rfs4_entry_t);
402 static uint32_t deleg_hash(void *);
403 static bool_t deleg_compare(rfs4_entry_t, void *);
404 static void *deleg_mkkey(rfs4_entry_t);
405 static uint32_t deleg_state_hash(void *);
406 static bool_t deleg_state_compare(rfs4_entry_t, void *);
407 static void *deleg_state_mkkey(rfs4_entry_t);
409 static void rfs4_state_rele_nounlock(rfs4_state_t *);
411 static int rfs4_ss_enabled = 0;
413 extern void (*rfs4_client_clrst)(struct nfs4clrst_args *);
415 void
416 rfs4_ss_pnfree(rfs4_ss_pn_t *ss_pn)
418 kmem_free(ss_pn, sizeof (rfs4_ss_pn_t));
421 static rfs4_ss_pn_t *
422 rfs4_ss_pnalloc(char *dir, char *leaf)
424 rfs4_ss_pn_t *ss_pn;
425 int dir_len, leaf_len;
428 * validate we have a resonable path
429 * (account for the '/' and trailing null)
431 if ((dir_len = strlen(dir)) > MAXPATHLEN ||
432 (leaf_len = strlen(leaf)) > MAXNAMELEN ||
433 (dir_len + leaf_len + 2) > MAXPATHLEN) {
434 return (NULL);
437 ss_pn = kmem_alloc(sizeof (rfs4_ss_pn_t), KM_SLEEP);
439 (void) snprintf(ss_pn->pn, MAXPATHLEN, "%s/%s", dir, leaf);
440 /* Handy pointer to just the leaf name */
441 ss_pn->leaf = ss_pn->pn + dir_len + 1;
442 return (ss_pn);
447 * Move the "leaf" filename from "sdir" directory
448 * to the "ddir" directory. Return the pathname of
449 * the destination unless the rename fails in which
450 * case we need to return the source pathname.
452 static rfs4_ss_pn_t *
453 rfs4_ss_movestate(char *sdir, char *ddir, char *leaf)
455 rfs4_ss_pn_t *src, *dst;
457 if ((src = rfs4_ss_pnalloc(sdir, leaf)) == NULL)
458 return (NULL);
460 if ((dst = rfs4_ss_pnalloc(ddir, leaf)) == NULL) {
461 rfs4_ss_pnfree(src);
462 return (NULL);
466 * If the rename fails we shall return the src
467 * pathname and free the dst. Otherwise we need
468 * to free the src and return the dst pathanme.
470 if (vn_rename(src->pn, dst->pn, UIO_SYSSPACE)) {
471 rfs4_ss_pnfree(dst);
472 return (src);
474 rfs4_ss_pnfree(src);
475 return (dst);
479 static rfs4_oldstate_t *
480 rfs4_ss_getstate(vnode_t *dvp, rfs4_ss_pn_t *ss_pn)
482 struct uio uio;
483 struct iovec iov[3];
485 rfs4_oldstate_t *cl_ss = NULL;
486 vnode_t *vp;
487 vattr_t va;
488 uint_t id_len;
489 int err, kill_file, file_vers;
491 if (ss_pn == NULL)
492 return (NULL);
495 * open the state file.
497 if (vn_open(ss_pn->pn, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0) != 0) {
498 return (NULL);
501 if (vp->v_type != VREG) {
502 (void) fop_close(vp, FREAD, 1, 0, CRED(), NULL);
503 VN_RELE(vp);
504 return (NULL);
507 err = fop_access(vp, VREAD, 0, CRED(), NULL);
508 if (err) {
510 * We don't have read access? better get the heck out.
512 (void) fop_close(vp, FREAD, 1, 0, CRED(), NULL);
513 VN_RELE(vp);
514 return (NULL);
517 (void) fop_rwlock(vp, V_WRITELOCK_FALSE, NULL);
519 * get the file size to do some basic validation
521 va.va_mask = VATTR_SIZE;
522 err = fop_getattr(vp, &va, 0, CRED(), NULL);
524 kill_file = (va.va_size == 0 || va.va_size <
525 (NFS4_VERIFIER_SIZE + sizeof (uint_t)+1));
527 if (err || kill_file) {
528 fop_rwunlock(vp, V_WRITELOCK_FALSE, NULL);
529 (void) fop_close(vp, FREAD, 1, 0, CRED(), NULL);
530 VN_RELE(vp);
531 if (kill_file) {
532 (void) fop_remove(dvp, ss_pn->leaf, CRED(), NULL, 0);
534 return (NULL);
537 cl_ss = kmem_alloc(sizeof (rfs4_oldstate_t), KM_SLEEP);
540 * build iovecs to read in the file_version, verifier and id_len
542 iov[0].iov_base = (caddr_t)&file_vers;
543 iov[0].iov_len = sizeof (int);
544 iov[1].iov_base = (caddr_t)&cl_ss->cl_id4.verifier;
545 iov[1].iov_len = NFS4_VERIFIER_SIZE;
546 iov[2].iov_base = (caddr_t)&id_len;
547 iov[2].iov_len = sizeof (uint_t);
549 uio.uio_iov = iov;
550 uio.uio_iovcnt = 3;
551 uio.uio_segflg = UIO_SYSSPACE;
552 uio.uio_loffset = 0;
553 uio.uio_resid = sizeof (int) + NFS4_VERIFIER_SIZE + sizeof (uint_t);
555 if (err = fop_read(vp, &uio, FREAD, CRED(), NULL)) {
556 fop_rwunlock(vp, V_WRITELOCK_FALSE, NULL);
557 (void) fop_close(vp, FREAD, 1, 0, CRED(), NULL);
558 VN_RELE(vp);
559 kmem_free(cl_ss, sizeof (rfs4_oldstate_t));
560 return (NULL);
564 * if the file_version doesn't match or if the
565 * id_len is zero or the combination of the verifier,
566 * id_len and id_val is bigger than the file we have
567 * a problem. If so ditch the file.
569 kill_file = (file_vers != NFS4_SS_VERSION || id_len == 0 ||
570 (id_len + NFS4_VERIFIER_SIZE + sizeof (uint_t)) > va.va_size);
572 if (err || kill_file) {
573 fop_rwunlock(vp, V_WRITELOCK_FALSE, NULL);
574 (void) fop_close(vp, FREAD, 1, 0, CRED(), NULL);
575 VN_RELE(vp);
576 kmem_free(cl_ss, sizeof (rfs4_oldstate_t));
577 if (kill_file) {
578 (void) fop_remove(dvp, ss_pn->leaf, CRED(), NULL, 0);
580 return (NULL);
584 * now get the client id value
586 cl_ss->cl_id4.id_val = kmem_alloc(id_len, KM_SLEEP);
587 iov[0].iov_base = cl_ss->cl_id4.id_val;
588 iov[0].iov_len = id_len;
590 uio.uio_iov = iov;
591 uio.uio_iovcnt = 1;
592 uio.uio_segflg = UIO_SYSSPACE;
593 uio.uio_resid = cl_ss->cl_id4.id_len = id_len;
595 if (err = fop_read(vp, &uio, FREAD, CRED(), NULL)) {
596 fop_rwunlock(vp, V_WRITELOCK_FALSE, NULL);
597 (void) fop_close(vp, FREAD, 1, 0, CRED(), NULL);
598 VN_RELE(vp);
599 kmem_free(cl_ss->cl_id4.id_val, id_len);
600 kmem_free(cl_ss, sizeof (rfs4_oldstate_t));
601 return (NULL);
604 fop_rwunlock(vp, V_WRITELOCK_FALSE, NULL);
605 (void) fop_close(vp, FREAD, 1, 0, CRED(), NULL);
606 VN_RELE(vp);
607 return (cl_ss);
610 #ifdef nextdp
611 #undef nextdp
612 #endif
613 #define nextdp(dp) ((struct dirent64 *)((char *)(dp) + (dp)->d_reclen))
616 * Add entries from statedir to supplied oldstate list.
617 * Optionally, move all entries from statedir -> destdir.
619 void
620 rfs4_ss_oldstate(rfs4_oldstate_t *oldstate, char *statedir, char *destdir)
622 rfs4_ss_pn_t *ss_pn;
623 rfs4_oldstate_t *cl_ss = NULL;
624 char *dirt = NULL;
625 int err, dir_eof = 0, size = 0;
626 vnode_t *dvp;
627 struct iovec iov;
628 struct uio uio;
629 struct dirent64 *dep;
630 offset_t dirchunk_offset = 0;
633 * open the state directory
635 if (vn_open(statedir, UIO_SYSSPACE, FREAD, 0, &dvp, 0, 0))
636 return;
638 if (dvp->v_type != VDIR || fop_access(dvp, VREAD, 0, CRED(), NULL))
639 goto out;
641 dirt = kmem_alloc(RFS4_SS_DIRSIZE, KM_SLEEP);
644 * Get and process the directory entries
646 while (!dir_eof) {
647 (void) fop_rwlock(dvp, V_WRITELOCK_FALSE, NULL);
648 iov.iov_base = dirt;
649 iov.iov_len = RFS4_SS_DIRSIZE;
650 uio.uio_iov = &iov;
651 uio.uio_iovcnt = 1;
652 uio.uio_segflg = UIO_SYSSPACE;
653 uio.uio_loffset = dirchunk_offset;
654 uio.uio_resid = RFS4_SS_DIRSIZE;
656 err = fop_readdir(dvp, &uio, CRED(), &dir_eof, NULL, 0);
657 fop_rwunlock(dvp, V_WRITELOCK_FALSE, NULL);
658 if (err)
659 goto out;
661 size = RFS4_SS_DIRSIZE - uio.uio_resid;
664 * Process all the directory entries in this
665 * readdir chunk
667 for (dep = (struct dirent64 *)dirt; size > 0;
668 dep = nextdp(dep)) {
670 size -= dep->d_reclen;
671 dirchunk_offset = dep->d_off;
674 * Skip '.' and '..'
676 if (NFS_IS_DOTNAME(dep->d_name))
677 continue;
679 ss_pn = rfs4_ss_pnalloc(statedir, dep->d_name);
680 if (ss_pn == NULL)
681 continue;
683 if (cl_ss = rfs4_ss_getstate(dvp, ss_pn)) {
684 if (destdir != NULL) {
685 rfs4_ss_pnfree(ss_pn);
686 cl_ss->ss_pn = rfs4_ss_movestate(
687 statedir, destdir, dep->d_name);
688 } else {
689 cl_ss->ss_pn = ss_pn;
691 insque(cl_ss, oldstate);
692 } else {
693 rfs4_ss_pnfree(ss_pn);
698 out:
699 (void) fop_close(dvp, FREAD, 1, 0, CRED(), NULL);
700 VN_RELE(dvp);
701 if (dirt)
702 kmem_free((caddr_t)dirt, RFS4_SS_DIRSIZE);
705 static void
706 rfs4_ss_init(void)
708 int npaths = 1;
709 char *default_dss_path = NFS4_DSS_VAR_DIR;
711 /* read the default stable storage state */
712 rfs4_dss_readstate(npaths, &default_dss_path);
714 rfs4_ss_enabled = 1;
717 static void
718 rfs4_ss_fini(void)
720 rfs4_servinst_t *sip;
722 mutex_enter(&rfs4_servinst_lock);
723 sip = rfs4_cur_servinst;
724 while (sip != NULL) {
725 rfs4_dss_clear_oldstate(sip);
726 sip = sip->next;
728 mutex_exit(&rfs4_servinst_lock);
732 * Remove all oldstate files referenced by this servinst.
734 static void
735 rfs4_dss_clear_oldstate(rfs4_servinst_t *sip)
737 rfs4_oldstate_t *os_head, *osp;
739 rw_enter(&sip->oldstate_lock, RW_WRITER);
740 os_head = sip->oldstate;
742 if (os_head == NULL) {
743 rw_exit(&sip->oldstate_lock);
744 return;
747 /* skip dummy entry */
748 osp = os_head->next;
749 while (osp != os_head) {
750 char *leaf = osp->ss_pn->leaf;
751 rfs4_oldstate_t *os_next;
753 rfs4_dss_remove_leaf(sip, NFS4_DSS_OLDSTATE_LEAF, leaf);
755 if (osp->cl_id4.id_val)
756 kmem_free(osp->cl_id4.id_val, osp->cl_id4.id_len);
757 rfs4_ss_pnfree(osp->ss_pn);
759 os_next = osp->next;
760 remque(osp);
761 kmem_free(osp, sizeof (rfs4_oldstate_t));
762 osp = os_next;
765 rw_exit(&sip->oldstate_lock);
769 * Form the state and oldstate paths, and read in the stable storage files.
771 void
772 rfs4_dss_readstate(int npaths, char **paths)
774 int i;
775 char *state, *oldstate;
777 state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
778 oldstate = kmem_alloc(MAXPATHLEN, KM_SLEEP);
780 for (i = 0; i < npaths; i++) {
781 char *path = paths[i];
783 (void) sprintf(state, "%s/%s", path, NFS4_DSS_STATE_LEAF);
784 (void) sprintf(oldstate, "%s/%s", path, NFS4_DSS_OLDSTATE_LEAF);
787 * Populate the current server instance's oldstate list.
789 * 1. Read stable storage data from old state directory,
790 * leaving its contents alone.
792 * 2. Read stable storage data from state directory,
793 * and move the latter's contents to old state
794 * directory.
796 rfs4_ss_oldstate(rfs4_cur_servinst->oldstate, oldstate, NULL);
797 rfs4_ss_oldstate(rfs4_cur_servinst->oldstate, state, oldstate);
800 kmem_free(state, MAXPATHLEN);
801 kmem_free(oldstate, MAXPATHLEN);
806 * Check if we are still in grace and if the client can be
807 * granted permission to perform reclaims.
809 void
810 rfs4_ss_chkclid(rfs4_client_t *cp)
812 rfs4_servinst_t *sip;
815 * It should be sufficient to check the oldstate data for just
816 * this client's instance. However, since our per-instance
817 * client grouping is solely temporal, HA-NFSv4 RG failover
818 * might result in clients of the same RG being partitioned into
819 * separate instances.
821 * Until the client grouping is improved, we must check the
822 * oldstate data for all instances with an active grace period.
824 * This also serves as the mechanism to remove stale oldstate data.
825 * The first time we check an instance after its grace period has
826 * expired, the oldstate data should be cleared.
828 * Start at the current instance, and walk the list backwards
829 * to the first.
831 mutex_enter(&rfs4_servinst_lock);
832 for (sip = rfs4_cur_servinst; sip != NULL; sip = sip->prev) {
833 rfs4_ss_chkclid_sip(cp, sip);
835 /* if the above check found this client, we're done */
836 if (cp->rc_can_reclaim)
837 break;
839 mutex_exit(&rfs4_servinst_lock);
842 static void
843 rfs4_ss_chkclid_sip(rfs4_client_t *cp, rfs4_servinst_t *sip)
845 rfs4_oldstate_t *osp, *os_head;
847 /* short circuit everything if this server instance has no oldstate */
848 rw_enter(&sip->oldstate_lock, RW_READER);
849 os_head = sip->oldstate;
850 rw_exit(&sip->oldstate_lock);
851 if (os_head == NULL)
852 return;
855 * If this server instance is no longer in a grace period then
856 * the client won't be able to reclaim. No further need for this
857 * instance's oldstate data, so it can be cleared.
859 if (!rfs4_servinst_in_grace(sip))
860 return;
862 /* this instance is still in grace; search for the clientid */
864 rw_enter(&sip->oldstate_lock, RW_READER);
866 os_head = sip->oldstate;
867 /* skip dummy entry */
868 osp = os_head->next;
869 while (osp != os_head) {
870 if (osp->cl_id4.id_len == cp->rc_nfs_client.id_len) {
871 if (bcmp(osp->cl_id4.id_val, cp->rc_nfs_client.id_val,
872 osp->cl_id4.id_len) == 0) {
873 cp->rc_can_reclaim = 1;
874 break;
877 osp = osp->next;
880 rw_exit(&sip->oldstate_lock);
884 * Place client information into stable storage: 1/3.
885 * First, generate the leaf filename, from the client's IP address and
886 * the server-generated short-hand clientid.
888 void
889 rfs4_ss_clid(rfs4_client_t *cp)
891 const char *kinet_ntop6(uchar_t *, char *, size_t);
892 char leaf[MAXNAMELEN], buf[INET6_ADDRSTRLEN];
893 struct sockaddr *ca;
894 uchar_t *b;
896 if (rfs4_ss_enabled == 0) {
897 return;
900 buf[0] = 0;
902 ca = (struct sockaddr *)&cp->rc_addr;
905 * Convert the caller's IP address to a dotted string
907 if (ca->sa_family == AF_INET) {
908 b = (uchar_t *)&((struct sockaddr_in *)ca)->sin_addr;
909 (void) sprintf(buf, "%03d.%03d.%03d.%03d", b[0] & 0xFF,
910 b[1] & 0xFF, b[2] & 0xFF, b[3] & 0xFF);
911 } else if (ca->sa_family == AF_INET6) {
912 struct sockaddr_in6 *sin6;
914 sin6 = (struct sockaddr_in6 *)ca;
915 (void) kinet_ntop6((uchar_t *)&sin6->sin6_addr,
916 buf, INET6_ADDRSTRLEN);
919 (void) snprintf(leaf, MAXNAMELEN, "%s-%llx", buf,
920 (longlong_t)cp->rc_clientid);
921 rfs4_ss_clid_write(cp, leaf);
925 * Place client information into stable storage: 2/3.
926 * DSS: distributed stable storage: the file may need to be written to
927 * multiple directories.
929 static void
930 rfs4_ss_clid_write(rfs4_client_t *cp, char *leaf)
932 rfs4_servinst_t *sip;
935 * It should be sufficient to write the leaf file to (all) DSS paths
936 * associated with just this client's instance. However, since our
937 * per-instance client grouping is solely temporal, HA-NFSv4 RG
938 * failover might result in us losing DSS data.
940 * Until the client grouping is improved, we must write the DSS data
941 * to all instances' paths. Start at the current instance, and
942 * walk the list backwards to the first.
944 mutex_enter(&rfs4_servinst_lock);
945 for (sip = rfs4_cur_servinst; sip != NULL; sip = sip->prev) {
946 int i, npaths = sip->dss_npaths;
948 /* write the leaf file to all DSS paths */
949 for (i = 0; i < npaths; i++) {
950 rfs4_dss_path_t *dss_path = sip->dss_paths[i];
952 /* HA-NFSv4 path might have been failed-away from us */
953 if (dss_path == NULL)
954 continue;
956 rfs4_ss_clid_write_one(cp, dss_path->path, leaf);
959 mutex_exit(&rfs4_servinst_lock);
963 * Place client information into stable storage: 3/3.
964 * Write the stable storage data to the requested file.
966 static void
967 rfs4_ss_clid_write_one(rfs4_client_t *cp, char *dss_path, char *leaf)
969 int ioflag;
970 int file_vers = NFS4_SS_VERSION;
971 size_t dirlen;
972 struct uio uio;
973 struct iovec iov[4];
974 char *dir;
975 rfs4_ss_pn_t *ss_pn;
976 vnode_t *vp;
977 nfs_client_id4 *cl_id4 = &(cp->rc_nfs_client);
979 /* allow 2 extra bytes for '/' & NUL */
980 dirlen = strlen(dss_path) + strlen(NFS4_DSS_STATE_LEAF) + 2;
981 dir = kmem_alloc(dirlen, KM_SLEEP);
982 (void) sprintf(dir, "%s/%s", dss_path, NFS4_DSS_STATE_LEAF);
984 ss_pn = rfs4_ss_pnalloc(dir, leaf);
985 /* rfs4_ss_pnalloc takes its own copy */
986 kmem_free(dir, dirlen);
987 if (ss_pn == NULL)
988 return;
990 if (vn_open(ss_pn->pn, UIO_SYSSPACE, FCREAT|FWRITE, 0600, &vp,
991 CRCREAT, 0)) {
992 rfs4_ss_pnfree(ss_pn);
993 return;
997 * We need to record leaf - i.e. the filename - so that we know
998 * what to remove, in the future. However, the dir part of cp->ss_pn
999 * should never be referenced directly, since it's potentially only
1000 * one of several paths with this leaf in it.
1002 if (cp->rc_ss_pn != NULL) {
1003 if (strcmp(cp->rc_ss_pn->leaf, leaf) == 0) {
1004 /* we've already recorded *this* leaf */
1005 rfs4_ss_pnfree(ss_pn);
1006 } else {
1007 /* replace with this leaf */
1008 rfs4_ss_pnfree(cp->rc_ss_pn);
1009 cp->rc_ss_pn = ss_pn;
1011 } else {
1012 cp->rc_ss_pn = ss_pn;
1016 * Build a scatter list that points to the nfs_client_id4
1018 iov[0].iov_base = (caddr_t)&file_vers;
1019 iov[0].iov_len = sizeof (int);
1020 iov[1].iov_base = (caddr_t)&(cl_id4->verifier);
1021 iov[1].iov_len = NFS4_VERIFIER_SIZE;
1022 iov[2].iov_base = (caddr_t)&(cl_id4->id_len);
1023 iov[2].iov_len = sizeof (uint_t);
1024 iov[3].iov_base = (caddr_t)cl_id4->id_val;
1025 iov[3].iov_len = cl_id4->id_len;
1027 uio.uio_iov = iov;
1028 uio.uio_iovcnt = 4;
1029 uio.uio_loffset = 0;
1030 uio.uio_segflg = UIO_SYSSPACE;
1031 uio.uio_llimit = (rlim64_t)MAXOFFSET_T;
1032 uio.uio_resid = cl_id4->id_len + sizeof (int) +
1033 NFS4_VERIFIER_SIZE + sizeof (uint_t);
1035 ioflag = uio.uio_fmode = (FWRITE|FSYNC);
1036 uio.uio_extflg = UIO_COPY_DEFAULT;
1038 (void) fop_rwlock(vp, V_WRITELOCK_TRUE, NULL);
1039 /* write the full client id to the file. */
1040 (void) fop_write(vp, &uio, ioflag, CRED(), NULL);
1041 fop_rwunlock(vp, V_WRITELOCK_TRUE, NULL);
1043 (void) fop_close(vp, FWRITE, 1, 0, CRED(), NULL);
1044 VN_RELE(vp);
1048 * DSS: distributed stable storage.
1049 * Unpack the list of paths passed by nfsd.
1050 * Use nvlist_alloc(9F) to manage the data.
1051 * The caller is responsible for allocating and freeing the buffer.
1054 rfs4_dss_setpaths(char *buf, size_t buflen)
1056 int error;
1059 * If this is a "warm start", i.e. we previously had DSS paths,
1060 * preserve the old paths.
1062 if (rfs4_dss_paths != NULL) {
1064 * Before we lose the ptr, destroy the nvlist and pathnames
1065 * array from the warm start before this one.
1067 nvlist_free(rfs4_dss_oldpaths);
1068 rfs4_dss_oldpaths = rfs4_dss_paths;
1071 /* unpack the buffer into a searchable nvlist */
1072 error = nvlist_unpack(buf, buflen, &rfs4_dss_paths, KM_SLEEP);
1073 if (error)
1074 return (error);
1077 * Search the nvlist for the pathnames nvpair (which is the only nvpair
1078 * in the list, and record its location.
1080 error = nvlist_lookup_string_array(rfs4_dss_paths, NFS4_DSS_NVPAIR_NAME,
1081 &rfs4_dss_newpaths, &rfs4_dss_numnewpaths);
1082 return (error);
1086 * Ultimately the nfssys() call NFS4_CLR_STATE endsup here
1087 * to find and mark the client for forced expire.
1089 static void
1090 rfs4_client_scrub(rfs4_entry_t ent, void *arg)
1092 rfs4_client_t *cp = (rfs4_client_t *)ent;
1093 struct nfs4clrst_args *clr = arg;
1094 struct sockaddr_in6 *ent_sin6;
1095 struct in6_addr clr_in6;
1096 struct sockaddr_in *ent_sin;
1097 struct in_addr clr_in;
1099 if (clr->addr_type != cp->rc_addr.ss_family) {
1100 return;
1103 switch (clr->addr_type) {
1105 case AF_INET6:
1106 /* copyin the address from user space */
1107 if (copyin(clr->ap, &clr_in6, sizeof (clr_in6))) {
1108 break;
1111 ent_sin6 = (struct sockaddr_in6 *)&cp->rc_addr;
1114 * now compare, and if equivalent mark entry
1115 * for forced expiration
1117 if (IN6_ARE_ADDR_EQUAL(&ent_sin6->sin6_addr, &clr_in6)) {
1118 cp->rc_forced_expire = 1;
1120 break;
1122 case AF_INET:
1123 /* copyin the address from user space */
1124 if (copyin(clr->ap, &clr_in, sizeof (clr_in))) {
1125 break;
1128 ent_sin = (struct sockaddr_in *)&cp->rc_addr;
1131 * now compare, and if equivalent mark entry
1132 * for forced expiration
1134 if (ent_sin->sin_addr.s_addr == clr_in.s_addr) {
1135 cp->rc_forced_expire = 1;
1137 break;
1139 default:
1140 /* force this assert to fail */
1141 ASSERT(clr->addr_type != clr->addr_type);
1146 * This is called from nfssys() in order to clear server state
1147 * for the specified client IP Address.
1149 void
1150 rfs4_clear_client_state(struct nfs4clrst_args *clr)
1152 (void) rfs4_dbe_walk(rfs4_client_tab, rfs4_client_scrub, clr);
/*
 * Summary of the steps below, all performed under rfs4_state_lock:
 *   - return at once if rfs4_server_state is already set;
 *   - initialise rfs4_findclient_lock and advance rfs4_start_time;
 *   - create a server instance without starting its grace period;
 *   - register the CPR callback and default any unset cache timers;
 *   - create the state database, then each table and its indices;
 *   - initialise stable storage and publish rfs4_clear_client_state
 *     through rfs4_client_clrst.
 */
1156 * Used to initialize the NFSv4 server's state or database. All of
1157 * the tables are created and timers are set. Only called when NFSv4
1158 * service is provided.
1160 void
1161 rfs4_state_init()
1163 int start_grace;
1164 extern boolean_t rfs4_cpr_callb(void *, int);
1165 char *dss_path = NFS4_DSS_VAR_DIR;
1166 time_t start_time;
1168 mutex_enter(&rfs4_state_lock);
1171 * If the server state database has already been initialized,
1172 * skip it
1174 if (rfs4_server_state != NULL) {
1175 mutex_exit(&rfs4_state_lock);
1176 return;
1179 rw_init(&rfs4_findclient_lock, NULL, RW_DEFAULT, NULL);
1182 * Set the boot time. If the server
1183 * has been restarted quickly and has had the opportunity to
1184 * service clients, then the start_time needs to be bumped
1185 * regardless. A small window but it exists...
/* rfs4_start_time only ever moves forward: to "now", or by one second */
1187 start_time = gethrestime_sec();
1188 if (rfs4_start_time < start_time)
1189 rfs4_start_time = start_time;
1190 else
1191 rfs4_start_time++;
1193 /* DSS: distributed stable storage: initialise served paths list */
1194 rfs4_dss_pathlist = NULL;
1197 * Create the first server instance, or a new one if the server has
1198 * been restarted; see above comments on rfs4_start_time. Don't
1199 * start its grace period; that will be done later, to maximise the
1200 * clients' recovery window.
/* a single DSS path (NFS4_DSS_VAR_DIR) is handed to the new instance */
1202 start_grace = 0;
1203 rfs4_servinst_create(start_grace, 1, &dss_path);
1205 /* reset the "first NFSv4 request" status */
1206 rfs4_seen_first_compound = 0;
1209 * Add a CPR callback so that we can update client
1210 * access times to extend the lease after a suspend
1211 * and resume (using the same class as rpcmod/connmgr)
1213 cpr_id = callb_add(rfs4_cpr_callb, 0, CB_CL_CPR_RPC, "rfs4");
1215 /* set the various cache timers for table creation */
/* a timer that is already non-zero is left as it was found */
1216 if (rfs4_client_cache_time == 0)
1217 rfs4_client_cache_time = CLIENT_CACHE_TIME;
1218 if (rfs4_openowner_cache_time == 0)
1219 rfs4_openowner_cache_time = OPENOWNER_CACHE_TIME;
1220 if (rfs4_state_cache_time == 0)
1221 rfs4_state_cache_time = STATE_CACHE_TIME;
1222 if (rfs4_lo_state_cache_time == 0)
1223 rfs4_lo_state_cache_time = LO_STATE_CACHE_TIME;
1224 if (rfs4_lockowner_cache_time == 0)
1225 rfs4_lockowner_cache_time = LOCKOWNER_CACHE_TIME;
1226 if (rfs4_file_cache_time == 0)
1227 rfs4_file_cache_time = FILE_CACHE_TIME;
1228 if (rfs4_deleg_state_cache_time == 0)
1229 rfs4_deleg_state_cache_time = DELEG_STATE_CACHE_TIME;
1231 /* Create the overall database to hold all server state */
1232 rfs4_server_state = rfs4_database_create(rfs4_database_debug);
1234 /* Now create the individual tables */
/*
 * Except for the ClntIP table, each table's cache time is multiplied by
 * rfs4_lease_time immediately before the table is created.
 */
/* Client table: indexed by nfs_client_id4 and by clientid */
1235 rfs4_client_cache_time *= rfs4_lease_time;
1236 rfs4_client_tab = rfs4_table_create(rfs4_server_state,
1237 "Client",
1238 rfs4_client_cache_time,
1240 rfs4_client_create,
1241 rfs4_client_destroy,
1242 rfs4_client_expiry,
1243 sizeof (rfs4_client_t),
1244 TABSIZE,
1245 MAXTABSZ/8, 100);
1246 rfs4_nfsclnt_idx = rfs4_index_create(rfs4_client_tab,
1247 "nfs_client_id4", nfsclnt_hash,
1248 nfsclnt_compare, nfsclnt_mkkey,
1249 TRUE);
1250 rfs4_clientid_idx = rfs4_index_create(rfs4_client_tab,
1251 "client_id", clientid_hash,
1252 clientid_compare, clientid_mkkey,
1253 FALSE);
/* ClntIP table: indexed by client IP address, fixed cache time */
1255 rfs4_clntip_cache_time = 86400 * 365; /* about a year */
1256 rfs4_clntip_tab = rfs4_table_create(rfs4_server_state,
1257 "ClntIP",
1258 rfs4_clntip_cache_time,
1260 rfs4_clntip_create,
1261 rfs4_clntip_destroy,
1262 rfs4_clntip_expiry,
1263 sizeof (rfs4_clntip_t),
1264 TABSIZE,
1265 MAXTABSZ, 100);
1266 rfs4_clntip_idx = rfs4_index_create(rfs4_clntip_tab,
1267 "client_ip", clntip_hash,
1268 clntip_compare, clntip_mkkey,
1269 TRUE);
/* OpenOwner table: indexed by open_owner4 */
1271 rfs4_openowner_cache_time *= rfs4_lease_time;
1272 rfs4_openowner_tab = rfs4_table_create(rfs4_server_state,
1273 "OpenOwner",
1274 rfs4_openowner_cache_time,
1276 rfs4_openowner_create,
1277 rfs4_openowner_destroy,
1278 rfs4_openowner_expiry,
1279 sizeof (rfs4_openowner_t),
1280 TABSIZE,
1281 MAXTABSZ, 100);
1282 rfs4_openowner_idx = rfs4_index_create(rfs4_openowner_tab,
1283 "open_owner4", openowner_hash,
1284 openowner_compare,
1285 openowner_mkkey, TRUE);
/* OpenStateID table: indexed by owner+file, by state id and by file */
1287 rfs4_state_cache_time *= rfs4_lease_time;
1288 rfs4_state_tab = rfs4_table_create(rfs4_server_state,
1289 "OpenStateID",
1290 rfs4_state_cache_time,
1292 rfs4_state_create,
1293 rfs4_state_destroy,
1294 rfs4_state_expiry,
1295 sizeof (rfs4_state_t),
1296 TABSIZE,
1297 MAXTABSZ, 100);
1299 rfs4_state_owner_file_idx = rfs4_index_create(rfs4_state_tab,
1300 "Openowner-File",
1301 state_owner_file_hash,
1302 state_owner_file_compare,
1303 state_owner_file_mkkey, TRUE);
1305 rfs4_state_idx = rfs4_index_create(rfs4_state_tab,
1306 "State-id", state_hash,
1307 state_compare, state_mkkey, FALSE);
1309 rfs4_state_file_idx = rfs4_index_create(rfs4_state_tab,
1310 "File", state_file_hash,
1311 state_file_compare, state_file_mkkey,
1312 FALSE);
/* LockStateID table: indexed by lock owner + state and by state id */
1314 rfs4_lo_state_cache_time *= rfs4_lease_time;
1315 rfs4_lo_state_tab = rfs4_table_create(rfs4_server_state,
1316 "LockStateID",
1317 rfs4_lo_state_cache_time,
1319 rfs4_lo_state_create,
1320 rfs4_lo_state_destroy,
1321 rfs4_lo_state_expiry,
1322 sizeof (rfs4_lo_state_t),
1323 TABSIZE,
1324 MAXTABSZ, 100);
1326 rfs4_lo_state_owner_idx = rfs4_index_create(rfs4_lo_state_tab,
1327 "lockownerxstate",
1328 lo_state_lo_hash,
1329 lo_state_lo_compare,
1330 lo_state_lo_mkkey, TRUE);
1332 rfs4_lo_state_idx = rfs4_index_create(rfs4_lo_state_tab,
1333 "State-id",
1334 lo_state_hash, lo_state_compare,
1335 lo_state_mkkey, FALSE);
/* Lockowner table: indexed by lock_owner4 and by pid */
1337 rfs4_lockowner_cache_time *= rfs4_lease_time;
1339 rfs4_lockowner_tab = rfs4_table_create(rfs4_server_state,
1340 "Lockowner",
1341 rfs4_lockowner_cache_time,
1343 rfs4_lockowner_create,
1344 rfs4_lockowner_destroy,
1345 rfs4_lockowner_expiry,
1346 sizeof (rfs4_lockowner_t),
1347 TABSIZE,
1348 MAXTABSZ, 100);
1350 rfs4_lockowner_idx = rfs4_index_create(rfs4_lockowner_tab,
1351 "lock_owner4", lockowner_hash,
1352 lockowner_compare,
1353 lockowner_mkkey, TRUE);
1355 rfs4_lockowner_pid_idx = rfs4_index_create(rfs4_lockowner_tab,
1356 "pid", pid_hash,
1357 pid_compare, pid_mkkey,
1358 FALSE);
/* File table: indexed by vnode; created with a NULL expiry callback */
1360 rfs4_file_cache_time *= rfs4_lease_time;
1361 rfs4_file_tab = rfs4_table_create(rfs4_server_state,
1362 "File",
1363 rfs4_file_cache_time,
1365 rfs4_file_create,
1366 rfs4_file_destroy,
1367 NULL,
1368 sizeof (rfs4_file_t),
1369 TABSIZE,
1370 MAXTABSZ, -1);
1372 rfs4_file_idx = rfs4_index_create(rfs4_file_tab,
1373 "Filehandle", file_hash,
1374 file_compare, file_mkkey, TRUE);
/* DelegStateID table: indexed by file+client and by delegation state */
1376 rfs4_deleg_state_cache_time *= rfs4_lease_time;
1377 rfs4_deleg_state_tab = rfs4_table_create(rfs4_server_state,
1378 "DelegStateID",
1379 rfs4_deleg_state_cache_time,
1381 rfs4_deleg_state_create,
1382 rfs4_deleg_state_destroy,
1383 rfs4_deleg_state_expiry,
1384 sizeof (rfs4_deleg_state_t),
1385 TABSIZE,
1386 MAXTABSZ, 100);
1387 rfs4_deleg_idx = rfs4_index_create(rfs4_deleg_state_tab,
1388 "DelegByFileClient",
1389 deleg_hash,
1390 deleg_compare,
1391 deleg_mkkey, TRUE);
1393 rfs4_deleg_state_idx = rfs4_index_create(rfs4_deleg_state_tab,
1394 "DelegState",
1395 deleg_state_hash,
1396 deleg_state_compare,
1397 deleg_state_mkkey, FALSE);
1400 * Init the stable storage.
1402 rfs4_ss_init();
/* publish the clear-state hook; rfs4_state_fini() resets it to NULL */
1404 rfs4_client_clrst = rfs4_clear_client_state;
1406 mutex_exit(&rfs4_state_lock);
1411 * Used at server shutdown to cleanup all of the NFSv4 server's structures
1412 * and other state.
1414 void
1415 rfs4_state_fini()
1417 rfs4_database_t *dbp;
1419 mutex_enter(&rfs4_state_lock);
1421 if (rfs4_server_state == NULL) {
1422 mutex_exit(&rfs4_state_lock);
1423 return;
1426 rfs4_client_clrst = NULL;
1428 rfs4_set_deleg_policy(SRV_NEVER_DELEGATE);
1429 dbp = rfs4_server_state;
1430 rfs4_server_state = NULL;
1433 * Cleanup the CPR callback.
1435 if (cpr_id)
1436 (void) callb_delete(cpr_id);
1438 rw_destroy(&rfs4_findclient_lock);
1440 /* First stop all of the reaper threads in the database */
1441 rfs4_database_shutdown(dbp);
1442 /* clean up any dangling stable storage structures */
1443 rfs4_ss_fini();
1444 /* Now actually destroy/release the database and its tables */
1445 rfs4_database_destroy(dbp);
1447 /* Reset the cache timers for next time */
1448 rfs4_client_cache_time = 0;
1449 rfs4_openowner_cache_time = 0;
1450 rfs4_state_cache_time = 0;
1451 rfs4_lo_state_cache_time = 0;
1452 rfs4_lockowner_cache_time = 0;
1453 rfs4_file_cache_time = 0;
1454 rfs4_deleg_state_cache_time = 0;
1456 mutex_exit(&rfs4_state_lock);
1458 /* destroy server instances and current instance ptr */
1459 rfs4_servinst_destroy_all();
1461 /* reset the "first NFSv4 request" status */
1462 rfs4_seen_first_compound = 0;
1464 /* DSS: distributed stable storage */
1465 nvlist_free(rfs4_dss_oldpaths);
1466 nvlist_free(rfs4_dss_paths);
1467 rfs4_dss_paths = rfs4_dss_oldpaths = NULL;
1470 typedef union {
1471 struct {
1472 uint32_t start_time;
1473 uint32_t c_id;
1474 } impl_id;
1475 clientid4 id4;
1476 } cid;
1478 typedef union {
1479 struct {
1480 uint32_t c_id;
1481 uint32_t gen_num;
1482 } cv_impl;
1483 verifier4 confirm_verf;
1484 } scid_confirm_verf;
1486 static uint32_t
1487 clientid_hash(void *key)
1489 cid *idp = key;
1491 return (idp->impl_id.c_id);
1494 static bool_t
1495 clientid_compare(rfs4_entry_t entry, void *key)
1497 rfs4_client_t *cp = (rfs4_client_t *)entry;
1498 clientid4 *idp = key;
1500 return (*idp == cp->rc_clientid);
1503 static void *
1504 clientid_mkkey(rfs4_entry_t entry)
1506 rfs4_client_t *cp = (rfs4_client_t *)entry;
1508 return (&cp->rc_clientid);
1511 static uint32_t
1512 nfsclnt_hash(void *key)
1514 nfs_client_id4 *client = key;
1515 int i;
1516 uint32_t hash = 0;
1518 for (i = 0; i < client->id_len; i++) {
1519 hash <<= 1;
1520 hash += (uint_t)client->id_val[i];
1522 return (hash);
1526 static bool_t
1527 nfsclnt_compare(rfs4_entry_t entry, void *key)
1529 rfs4_client_t *cp = (rfs4_client_t *)entry;
1530 nfs_client_id4 *nfs_client = key;
1532 if (cp->rc_nfs_client.id_len != nfs_client->id_len)
1533 return (FALSE);
1535 return (bcmp(cp->rc_nfs_client.id_val, nfs_client->id_val,
1536 nfs_client->id_len) == 0);
1539 static void *
1540 nfsclnt_mkkey(rfs4_entry_t entry)
1542 rfs4_client_t *cp = (rfs4_client_t *)entry;
1544 return (&cp->rc_nfs_client);
1547 static bool_t
1548 rfs4_client_expiry(rfs4_entry_t u_entry)
1550 rfs4_client_t *cp = (rfs4_client_t *)u_entry;
1551 bool_t cp_expired;
1553 if (rfs4_dbe_is_invalid(cp->rc_dbe)) {
1554 cp->rc_ss_remove = 1;
1555 return (TRUE);
1558 * If the sysadmin has used clear_locks for this
1559 * entry then forced_expire will be set and we
1560 * want this entry to be reaped. Or the entry
1561 * has exceeded its lease period.
1563 cp_expired = (cp->rc_forced_expire ||
1564 (gethrestime_sec() - cp->rc_last_access
1565 > rfs4_lease_time));
1567 if (!cp->rc_ss_remove && cp_expired)
1568 cp->rc_ss_remove = 1;
1569 return (cp_expired);
1573 * Remove the leaf file from all distributed stable storage paths.
1575 static void
1576 rfs4_dss_remove_cpleaf(rfs4_client_t *cp)
1578 rfs4_servinst_t *sip;
1579 char *leaf = cp->rc_ss_pn->leaf;
1582 * since the state files are written to all DSS
1583 * paths we must remove this leaf file instance
1584 * from all server instances.
1587 mutex_enter(&rfs4_servinst_lock);
1588 for (sip = rfs4_cur_servinst; sip != NULL; sip = sip->prev) {
1589 /* remove the leaf file associated with this server instance */
1590 rfs4_dss_remove_leaf(sip, NFS4_DSS_STATE_LEAF, leaf);
1592 mutex_exit(&rfs4_servinst_lock);
1595 static void
1596 rfs4_dss_remove_leaf(rfs4_servinst_t *sip, char *dir_leaf, char *leaf)
1598 int i, npaths = sip->dss_npaths;
1600 for (i = 0; i < npaths; i++) {
1601 rfs4_dss_path_t *dss_path = sip->dss_paths[i];
1602 char *path, *dir;
1603 size_t pathlen;
1605 /* the HA-NFSv4 path might have been failed-over away from us */
1606 if (dss_path == NULL)
1607 continue;
1609 dir = dss_path->path;
1611 /* allow 3 extra bytes for two '/' & a NUL */
1612 pathlen = strlen(dir) + strlen(dir_leaf) + strlen(leaf) + 3;
1613 path = kmem_alloc(pathlen, KM_SLEEP);
1614 (void) sprintf(path, "%s/%s/%s", dir, dir_leaf, leaf);
1616 (void) vn_remove(path, UIO_SYSSPACE, RMFILE);
1618 kmem_free(path, pathlen);
1622 static void
1623 rfs4_client_destroy(rfs4_entry_t u_entry)
1625 rfs4_client_t *cp = (rfs4_client_t *)u_entry;
1627 mutex_destroy(cp->rc_cbinfo.cb_lock);
1628 cv_destroy(cp->rc_cbinfo.cb_cv);
1629 cv_destroy(cp->rc_cbinfo.cb_cv_nullcaller);
1630 list_destroy(&cp->rc_openownerlist);
1632 /* free callback info */
1633 rfs4_cbinfo_free(&cp->rc_cbinfo);
1635 if (cp->rc_cp_confirmed)
1636 rfs4_client_rele(cp->rc_cp_confirmed);
1638 if (cp->rc_ss_pn) {
1639 /* check if the stable storage files need to be removed */
1640 if (cp->rc_ss_remove)
1641 rfs4_dss_remove_cpleaf(cp);
1642 rfs4_ss_pnfree(cp->rc_ss_pn);
1645 /* Free the client supplied client id */
1646 kmem_free(cp->rc_nfs_client.id_val, cp->rc_nfs_client.id_len);
1648 if (cp->rc_sysidt != LM_NOSYSID)
1649 lm_free_sysidt(cp->rc_sysidt);
1652 static bool_t
1653 rfs4_client_create(rfs4_entry_t u_entry, void *arg)
1655 rfs4_client_t *cp = (rfs4_client_t *)u_entry;
1656 nfs_client_id4 *client = (nfs_client_id4 *)arg;
1657 struct sockaddr *ca;
1658 cid *cidp;
1659 scid_confirm_verf *scvp;
1661 /* Get a clientid to give to the client */
1662 cidp = (cid *)&cp->rc_clientid;
1663 cidp->impl_id.start_time = rfs4_start_time;
1664 cidp->impl_id.c_id = (uint32_t)rfs4_dbe_getid(cp->rc_dbe);
1666 /* Allocate and copy client's client id value */
1667 cp->rc_nfs_client.id_val = kmem_alloc(client->id_len, KM_SLEEP);
1668 cp->rc_nfs_client.id_len = client->id_len;
1669 bcopy(client->id_val, cp->rc_nfs_client.id_val, client->id_len);
1670 cp->rc_nfs_client.verifier = client->verifier;
1672 /* Copy client's IP address */
1673 ca = client->cl_addr;
1674 if (ca->sa_family == AF_INET)
1675 bcopy(ca, &cp->rc_addr, sizeof (struct sockaddr_in));
1676 else if (ca->sa_family == AF_INET6)
1677 bcopy(ca, &cp->rc_addr, sizeof (struct sockaddr_in6));
1678 cp->rc_nfs_client.cl_addr = (struct sockaddr *)&cp->rc_addr;
1680 /* Init the value for the SETCLIENTID_CONFIRM verifier */
1681 scvp = (scid_confirm_verf *)&cp->rc_confirm_verf;
1682 scvp->cv_impl.c_id = cidp->impl_id.c_id;
1683 scvp->cv_impl.gen_num = 0;
1685 /* An F_UNLKSYS has been done for this client */
1686 cp->rc_unlksys_completed = FALSE;
1688 /* We need the client to ack us */
1689 cp->rc_need_confirm = TRUE;
1690 cp->rc_cp_confirmed = NULL;
1692 /* TRUE all the time until the callback path actually fails */
1693 cp->rc_cbinfo.cb_notified_of_cb_path_down = TRUE;
1695 /* Initialize the access time to now */
1696 cp->rc_last_access = gethrestime_sec();
1698 cp->rc_cr_set = NULL;
1700 cp->rc_sysidt = LM_NOSYSID;
1702 list_create(&cp->rc_openownerlist, sizeof (rfs4_openowner_t),
1703 offsetof(rfs4_openowner_t, ro_node));
1705 /* set up the callback control structure */
1706 cp->rc_cbinfo.cb_state = CB_UNINIT;
1707 mutex_init(cp->rc_cbinfo.cb_lock, NULL, MUTEX_DEFAULT, NULL);
1708 cv_init(cp->rc_cbinfo.cb_cv, NULL, CV_DEFAULT, NULL);
1709 cv_init(cp->rc_cbinfo.cb_cv_nullcaller, NULL, CV_DEFAULT, NULL);
1712 * Associate the client_t with the current server instance.
1713 * The hold is solely to satisfy the calling requirement of
1714 * rfs4_servinst_assign(). In this case it's not strictly necessary.
1716 rfs4_dbe_hold(cp->rc_dbe);
1717 rfs4_servinst_assign(cp, rfs4_cur_servinst);
1718 rfs4_dbe_rele(cp->rc_dbe);
1720 return (TRUE);
1724 * Caller wants to generate/update the setclientid_confirm verifier
1725 * associated with a client. This is done during the SETCLIENTID
1726 * processing.
1728 void
1729 rfs4_client_scv_next(rfs4_client_t *cp)
1731 scid_confirm_verf *scvp;
1733 /* Init the value for the SETCLIENTID_CONFIRM verifier */
1734 scvp = (scid_confirm_verf *)&cp->rc_confirm_verf;
1735 scvp->cv_impl.gen_num++;
1738 void
1739 rfs4_client_rele(rfs4_client_t *cp)
1741 rfs4_dbe_rele(cp->rc_dbe);
1744 rfs4_client_t *
1745 rfs4_findclient(nfs_client_id4 *client, bool_t *create, rfs4_client_t *oldcp)
1747 rfs4_client_t *cp;
1750 if (oldcp) {
1751 rw_enter(&rfs4_findclient_lock, RW_WRITER);
1752 rfs4_dbe_hide(oldcp->rc_dbe);
1753 } else {
1754 rw_enter(&rfs4_findclient_lock, RW_READER);
1757 cp = (rfs4_client_t *)rfs4_dbsearch(rfs4_nfsclnt_idx, client,
1758 create, (void *)client, RFS4_DBS_VALID);
1760 if (oldcp)
1761 rfs4_dbe_unhide(oldcp->rc_dbe);
1763 rw_exit(&rfs4_findclient_lock);
1765 return (cp);
1768 rfs4_client_t *
1769 rfs4_findclient_by_id(clientid4 clientid, bool_t find_unconfirmed)
1771 rfs4_client_t *cp;
1772 bool_t create = FALSE;
1773 cid *cidp = (cid *)&clientid;
1775 rw_enter(&rfs4_findclient_lock, RW_READER);
1777 cp = (rfs4_client_t *)rfs4_dbsearch(rfs4_clientid_idx, &clientid,
1778 &create, NULL, RFS4_DBS_VALID);
1780 rw_exit(&rfs4_findclient_lock);
1782 if (cp && cp->rc_need_confirm && find_unconfirmed == FALSE) {
1783 rfs4_client_rele(cp);
1784 return (NULL);
1785 } else {
1786 return (cp);
1790 static uint32_t
1791 clntip_hash(void *key)
1793 struct sockaddr *addr = key;
1794 int i, len = 0;
1795 uint32_t hash = 0;
1796 char *ptr;
1798 if (addr->sa_family == AF_INET) {
1799 struct sockaddr_in *a = (struct sockaddr_in *)addr;
1800 len = sizeof (struct in_addr);
1801 ptr = (char *)&a->sin_addr;
1802 } else if (addr->sa_family == AF_INET6) {
1803 struct sockaddr_in6 *a = (struct sockaddr_in6 *)addr;
1804 len = sizeof (struct in6_addr);
1805 ptr = (char *)&a->sin6_addr;
1806 } else
1807 return (0);
1809 for (i = 0; i < len; i++) {
1810 hash <<= 1;
1811 hash += (uint_t)ptr[i];
1813 return (hash);
1816 static bool_t
1817 clntip_compare(rfs4_entry_t entry, void *key)
1819 rfs4_clntip_t *cp = (rfs4_clntip_t *)entry;
1820 struct sockaddr *addr = key;
1821 int len = 0;
1822 char *p1, *p2;
1824 if (addr->sa_family == AF_INET) {
1825 struct sockaddr_in *a1 = (struct sockaddr_in *)&cp->ri_addr;
1826 struct sockaddr_in *a2 = (struct sockaddr_in *)addr;
1827 len = sizeof (struct in_addr);
1828 p1 = (char *)&a1->sin_addr;
1829 p2 = (char *)&a2->sin_addr;
1830 } else if (addr->sa_family == AF_INET6) {
1831 struct sockaddr_in6 *a1 = (struct sockaddr_in6 *)&cp->ri_addr;
1832 struct sockaddr_in6 *a2 = (struct sockaddr_in6 *)addr;
1833 len = sizeof (struct in6_addr);
1834 p1 = (char *)&a1->sin6_addr;
1835 p2 = (char *)&a2->sin6_addr;
1836 } else
1837 return (0);
1839 return (bcmp(p1, p2, len) == 0);
1842 static void *
1843 clntip_mkkey(rfs4_entry_t entry)
1845 rfs4_clntip_t *cp = (rfs4_clntip_t *)entry;
1847 return (&cp->ri_addr);
1850 static bool_t
1851 rfs4_clntip_expiry(rfs4_entry_t u_entry)
1853 rfs4_clntip_t *cp = (rfs4_clntip_t *)u_entry;
1855 if (rfs4_dbe_is_invalid(cp->ri_dbe))
1856 return (TRUE);
1857 return (FALSE);
1860 /* ARGSUSED */
1861 static void
1862 rfs4_clntip_destroy(rfs4_entry_t u_entry)
1866 static bool_t
1867 rfs4_clntip_create(rfs4_entry_t u_entry, void *arg)
1869 rfs4_clntip_t *cp = (rfs4_clntip_t *)u_entry;
1870 struct sockaddr *ca = (struct sockaddr *)arg;
1872 /* Copy client's IP address */
1873 if (ca->sa_family == AF_INET)
1874 bcopy(ca, &cp->ri_addr, sizeof (struct sockaddr_in));
1875 else if (ca->sa_family == AF_INET6)
1876 bcopy(ca, &cp->ri_addr, sizeof (struct sockaddr_in6));
1877 else
1878 return (FALSE);
1879 cp->ri_no_referrals = 1;
1881 return (TRUE);
1884 rfs4_clntip_t *
1885 rfs4_find_clntip(struct sockaddr *addr, bool_t *create)
1887 rfs4_clntip_t *cp;
1889 rw_enter(&rfs4_findclient_lock, RW_READER);
1891 cp = (rfs4_clntip_t *)rfs4_dbsearch(rfs4_clntip_idx, addr,
1892 create, addr, RFS4_DBS_VALID);
1894 rw_exit(&rfs4_findclient_lock);
1896 return (cp);
1899 void
1900 rfs4_invalidate_clntip(struct sockaddr *addr)
1902 rfs4_clntip_t *cp;
1903 bool_t create = FALSE;
1905 rw_enter(&rfs4_findclient_lock, RW_READER);
1907 cp = (rfs4_clntip_t *)rfs4_dbsearch(rfs4_clntip_idx, addr,
1908 &create, NULL, RFS4_DBS_VALID);
1909 if (cp == NULL) {
1910 rw_exit(&rfs4_findclient_lock);
1911 return;
1913 rfs4_dbe_invalidate(cp->ri_dbe);
1914 rfs4_dbe_rele(cp->ri_dbe);
1916 rw_exit(&rfs4_findclient_lock);
1919 bool_t
1920 rfs4_lease_expired(rfs4_client_t *cp)
1922 bool_t rc;
1924 rfs4_dbe_lock(cp->rc_dbe);
1927 * If the admin has executed clear_locks for this
1928 * client id, force expire will be set, so no need
1929 * to calculate anything because it's "outa here".
1931 if (cp->rc_forced_expire) {
1932 rc = TRUE;
1933 } else {
1934 rc = (gethrestime_sec() - cp->rc_last_access > rfs4_lease_time);
1938 * If the lease has expired we will also want
1939 * to remove any stable storage state data. So
1940 * mark the client id accordingly.
1942 if (!cp->rc_ss_remove)
1943 cp->rc_ss_remove = (rc == TRUE);
1945 rfs4_dbe_unlock(cp->rc_dbe);
1947 return (rc);
1950 void
1951 rfs4_update_lease(rfs4_client_t *cp)
1953 rfs4_dbe_lock(cp->rc_dbe);
1954 if (!cp->rc_forced_expire)
1955 cp->rc_last_access = gethrestime_sec();
1956 rfs4_dbe_unlock(cp->rc_dbe);
1960 static bool_t
1961 EQOPENOWNER(open_owner4 *a, open_owner4 *b)
1963 bool_t rc;
1965 if (a->clientid != b->clientid)
1966 return (FALSE);
1968 if (a->owner_len != b->owner_len)
1969 return (FALSE);
1971 rc = (bcmp(a->owner_val, b->owner_val, a->owner_len) == 0);
1973 return (rc);
1976 static uint_t
1977 openowner_hash(void *key)
1979 int i;
1980 open_owner4 *openowner = key;
1981 uint_t hash = 0;
1983 for (i = 0; i < openowner->owner_len; i++) {
1984 hash <<= 4;
1985 hash += (uint_t)openowner->owner_val[i];
1987 hash += (uint_t)openowner->clientid;
1988 hash |= (openowner->clientid >> 32);
1990 return (hash);
1993 static bool_t
1994 openowner_compare(rfs4_entry_t u_entry, void *key)
1996 rfs4_openowner_t *oo = (rfs4_openowner_t *)u_entry;
1997 open_owner4 *arg = key;
1999 return (EQOPENOWNER(&oo->ro_owner, arg));
2002 void *
2003 openowner_mkkey(rfs4_entry_t u_entry)
2005 rfs4_openowner_t *oo = (rfs4_openowner_t *)u_entry;
2007 return (&oo->ro_owner);
2010 /* ARGSUSED */
2011 static bool_t
2012 rfs4_openowner_expiry(rfs4_entry_t u_entry)
2014 /* openstateid held us and did all needed delay */
2015 return (TRUE);
2018 static void
2019 rfs4_openowner_destroy(rfs4_entry_t u_entry)
2021 rfs4_openowner_t *oo = (rfs4_openowner_t *)u_entry;
2023 /* Remove open owner from client's lists of open owners */
2024 rfs4_dbe_lock(oo->ro_client->rc_dbe);
2025 list_remove(&oo->ro_client->rc_openownerlist, oo);
2026 rfs4_dbe_unlock(oo->ro_client->rc_dbe);
2028 /* One less reference to the client */
2029 rfs4_client_rele(oo->ro_client);
2030 oo->ro_client = NULL;
2032 /* Free the last reply for this lock owner */
2033 rfs4_free_reply(&oo->ro_reply);
2035 if (oo->ro_reply_fh.nfs_fh4_val) {
2036 kmem_free(oo->ro_reply_fh.nfs_fh4_val,
2037 oo->ro_reply_fh.nfs_fh4_len);
2038 oo->ro_reply_fh.nfs_fh4_val = NULL;
2039 oo->ro_reply_fh.nfs_fh4_len = 0;
2042 rfs4_sw_destroy(&oo->ro_sw);
2043 list_destroy(&oo->ro_statelist);
2045 /* Free the lock owner id */
2046 kmem_free(oo->ro_owner.owner_val, oo->ro_owner.owner_len);
2049 void
2050 rfs4_openowner_rele(rfs4_openowner_t *oo)
2052 rfs4_dbe_rele(oo->ro_dbe);
2055 static bool_t
2056 rfs4_openowner_create(rfs4_entry_t u_entry, void *arg)
2058 rfs4_openowner_t *oo = (rfs4_openowner_t *)u_entry;
2059 rfs4_openowner_t *argp = (rfs4_openowner_t *)arg;
2060 open_owner4 *openowner = &argp->ro_owner;
2061 seqid4 seqid = argp->ro_open_seqid;
2062 rfs4_client_t *cp;
2063 bool_t create = FALSE;
2065 rw_enter(&rfs4_findclient_lock, RW_READER);
2067 cp = (rfs4_client_t *)rfs4_dbsearch(rfs4_clientid_idx,
2068 &openowner->clientid,
2069 &create, NULL, RFS4_DBS_VALID);
2071 rw_exit(&rfs4_findclient_lock);
2073 if (cp == NULL)
2074 return (FALSE);
2076 oo->ro_reply_fh.nfs_fh4_len = 0;
2077 oo->ro_reply_fh.nfs_fh4_val = NULL;
2079 oo->ro_owner.clientid = openowner->clientid;
2080 oo->ro_owner.owner_val =
2081 kmem_alloc(openowner->owner_len, KM_SLEEP);
2083 bcopy(openowner->owner_val,
2084 oo->ro_owner.owner_val, openowner->owner_len);
2086 oo->ro_owner.owner_len = openowner->owner_len;
2088 oo->ro_need_confirm = TRUE;
2090 rfs4_sw_init(&oo->ro_sw);
2092 oo->ro_open_seqid = seqid;
2093 bzero(&oo->ro_reply, sizeof (nfs_resop4));
2094 oo->ro_client = cp;
2095 oo->ro_cr_set = NULL;
2097 list_create(&oo->ro_statelist, sizeof (rfs4_state_t),
2098 offsetof(rfs4_state_t, rs_node));
2100 /* Insert openowner into client's open owner list */
2101 rfs4_dbe_lock(cp->rc_dbe);
2102 list_insert_tail(&cp->rc_openownerlist, oo);
2103 rfs4_dbe_unlock(cp->rc_dbe);
2105 return (TRUE);
2108 rfs4_openowner_t *
2109 rfs4_findopenowner(open_owner4 *openowner, bool_t *create, seqid4 seqid)
2111 rfs4_openowner_t *oo;
2112 rfs4_openowner_t arg;
2114 arg.ro_owner = *openowner;
2115 arg.ro_open_seqid = seqid;
2116 oo = (rfs4_openowner_t *)rfs4_dbsearch(rfs4_openowner_idx, openowner,
2117 create, &arg, RFS4_DBS_VALID);
2119 return (oo);
2122 void
2123 rfs4_update_open_sequence(rfs4_openowner_t *oo)
2126 rfs4_dbe_lock(oo->ro_dbe);
2128 oo->ro_open_seqid++;
2130 rfs4_dbe_unlock(oo->ro_dbe);
2133 void
2134 rfs4_update_open_resp(rfs4_openowner_t *oo, nfs_resop4 *resp, nfs_fh4 *fh)
2137 rfs4_dbe_lock(oo->ro_dbe);
2139 rfs4_free_reply(&oo->ro_reply);
2141 rfs4_copy_reply(&oo->ro_reply, resp);
2143 /* Save the filehandle if provided and free if not used */
2144 if (resp->nfs_resop4_u.opopen.status == NFS4_OK &&
2145 fh && fh->nfs_fh4_len) {
2146 if (oo->ro_reply_fh.nfs_fh4_val == NULL)
2147 oo->ro_reply_fh.nfs_fh4_val =
2148 kmem_alloc(fh->nfs_fh4_len, KM_SLEEP);
2149 nfs_fh4_copy(fh, &oo->ro_reply_fh);
2150 } else {
2151 if (oo->ro_reply_fh.nfs_fh4_val) {
2152 kmem_free(oo->ro_reply_fh.nfs_fh4_val,
2153 oo->ro_reply_fh.nfs_fh4_len);
2154 oo->ro_reply_fh.nfs_fh4_val = NULL;
2155 oo->ro_reply_fh.nfs_fh4_len = 0;
2159 rfs4_dbe_unlock(oo->ro_dbe);
2162 static bool_t
2163 lockowner_compare(rfs4_entry_t u_entry, void *key)
2165 rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2166 lock_owner4 *b = (lock_owner4 *)key;
2168 if (lo->rl_owner.clientid != b->clientid)
2169 return (FALSE);
2171 if (lo->rl_owner.owner_len != b->owner_len)
2172 return (FALSE);
2174 return (bcmp(lo->rl_owner.owner_val, b->owner_val,
2175 lo->rl_owner.owner_len) == 0);
2178 void *
2179 lockowner_mkkey(rfs4_entry_t u_entry)
2181 rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2183 return (&lo->rl_owner);
2186 static uint32_t
2187 lockowner_hash(void *key)
2189 int i;
2190 lock_owner4 *lockowner = key;
2191 uint_t hash = 0;
2193 for (i = 0; i < lockowner->owner_len; i++) {
2194 hash <<= 4;
2195 hash += (uint_t)lockowner->owner_val[i];
2197 hash += (uint_t)lockowner->clientid;
2198 hash |= (lockowner->clientid >> 32);
2200 return (hash);
/*
 * Hash for the "pid" index.  The key is the pid value itself carried
 * in a pointer (see pid_mkkey()), so the hash is that value truncated
 * to 32 bits.
 */
static uint32_t
pid_hash(void *key)
{
	uintptr_t raw = (uintptr_t)key;

	return ((uint32_t)raw);
}
2209 static void *
2210 pid_mkkey(rfs4_entry_t u_entry)
2212 rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2214 return ((void *)(uintptr_t)lo->rl_pid);
2217 static bool_t
2218 pid_compare(rfs4_entry_t u_entry, void *key)
2220 rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2222 return (lo->rl_pid == (pid_t)(uintptr_t)key);
2225 static void
2226 rfs4_lockowner_destroy(rfs4_entry_t u_entry)
2228 rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2230 /* Free the lock owner id */
2231 kmem_free(lo->rl_owner.owner_val, lo->rl_owner.owner_len);
2232 rfs4_client_rele(lo->rl_client);
2235 void
2236 rfs4_lockowner_rele(rfs4_lockowner_t *lo)
2238 rfs4_dbe_rele(lo->rl_dbe);
2241 /* ARGSUSED */
2242 static bool_t
2243 rfs4_lockowner_expiry(rfs4_entry_t u_entry)
2246 * Since expiry is called with no other references on
2247 * this struct, go ahead and have it removed.
2249 return (TRUE);
2252 static bool_t
2253 rfs4_lockowner_create(rfs4_entry_t u_entry, void *arg)
2255 rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2256 lock_owner4 *lockowner = (lock_owner4 *)arg;
2257 rfs4_client_t *cp;
2258 bool_t create = FALSE;
2260 rw_enter(&rfs4_findclient_lock, RW_READER);
2262 cp = (rfs4_client_t *)rfs4_dbsearch(rfs4_clientid_idx,
2263 &lockowner->clientid,
2264 &create, NULL, RFS4_DBS_VALID);
2266 rw_exit(&rfs4_findclient_lock);
2268 if (cp == NULL)
2269 return (FALSE);
2271 /* Reference client */
2272 lo->rl_client = cp;
2273 lo->rl_owner.clientid = lockowner->clientid;
2274 lo->rl_owner.owner_val = kmem_alloc(lockowner->owner_len, KM_SLEEP);
2275 bcopy(lockowner->owner_val, lo->rl_owner.owner_val,
2276 lockowner->owner_len);
2277 lo->rl_owner.owner_len = lockowner->owner_len;
2278 lo->rl_pid = rfs4_dbe_getid(lo->rl_dbe);
2280 return (TRUE);
2283 rfs4_lockowner_t *
2284 rfs4_findlockowner(lock_owner4 *lockowner, bool_t *create)
2286 rfs4_lockowner_t *lo;
2288 lo = (rfs4_lockowner_t *)rfs4_dbsearch(rfs4_lockowner_idx, lockowner,
2289 create, lockowner, RFS4_DBS_VALID);
2291 return (lo);
2294 rfs4_lockowner_t *
2295 rfs4_findlockowner_by_pid(pid_t pid)
2297 rfs4_lockowner_t *lo;
2298 bool_t create = FALSE;
2300 lo = (rfs4_lockowner_t *)rfs4_dbsearch(rfs4_lockowner_pid_idx,
2301 (void *)(uintptr_t)pid, &create, NULL, RFS4_DBS_VALID);
2303 return (lo);
2307 static uint32_t
2308 file_hash(void *key)
2310 return (ADDRHASH(key));
2313 static void *
2314 file_mkkey(rfs4_entry_t u_entry)
2316 rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2318 return (fp->rf_vp);
2321 static bool_t
2322 file_compare(rfs4_entry_t u_entry, void *key)
2324 rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2326 return (fp->rf_vp == (vnode_t *)key);
2329 static void
2330 rfs4_file_destroy(rfs4_entry_t u_entry)
2332 rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2334 list_destroy(&fp->rf_delegstatelist);
2336 if (fp->rf_filehandle.nfs_fh4_val)
2337 kmem_free(fp->rf_filehandle.nfs_fh4_val,
2338 fp->rf_filehandle.nfs_fh4_len);
2339 cv_destroy(fp->rf_dinfo.rd_recall_cv);
2340 if (fp->rf_vp) {
2341 vnode_t *vp = fp->rf_vp;
2343 mutex_enter(&vp->v_vsd_lock);
2344 (void) vsd_set(vp, nfs4_srv_vkey, NULL);
2345 mutex_exit(&vp->v_vsd_lock);
2346 VN_RELE(vp);
2347 fp->rf_vp = NULL;
2349 rw_destroy(&fp->rf_file_rwlock);
2353 * Used to unlock the underlying dbe struct only
2355 void
2356 rfs4_file_rele(rfs4_file_t *fp)
2358 rfs4_dbe_rele(fp->rf_dbe);
2361 typedef struct {
2362 vnode_t *vp;
2363 nfs_fh4 *fh;
2364 } rfs4_fcreate_arg;
2366 static bool_t
2367 rfs4_file_create(rfs4_entry_t u_entry, void *arg)
2369 rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2370 rfs4_fcreate_arg *ap = (rfs4_fcreate_arg *)arg;
2371 vnode_t *vp = ap->vp;
2372 nfs_fh4 *fh = ap->fh;
2374 VN_HOLD(vp);
2376 fp->rf_filehandle.nfs_fh4_len = 0;
2377 fp->rf_filehandle.nfs_fh4_val = NULL;
2378 ASSERT(fh && fh->nfs_fh4_len);
2379 if (fh && fh->nfs_fh4_len) {
2380 fp->rf_filehandle.nfs_fh4_val =
2381 kmem_alloc(fh->nfs_fh4_len, KM_SLEEP);
2382 nfs_fh4_copy(fh, &fp->rf_filehandle);
2384 fp->rf_vp = vp;
2386 list_create(&fp->rf_delegstatelist, sizeof (rfs4_deleg_state_t),
2387 offsetof(rfs4_deleg_state_t, rds_node));
2389 fp->rf_share_deny = fp->rf_share_access = fp->rf_access_read = 0;
2390 fp->rf_access_write = fp->rf_deny_read = fp->rf_deny_write = 0;
2392 mutex_init(fp->rf_dinfo.rd_recall_lock, NULL, MUTEX_DEFAULT, NULL);
2393 cv_init(fp->rf_dinfo.rd_recall_cv, NULL, CV_DEFAULT, NULL);
2395 fp->rf_dinfo.rd_dtype = OPEN_DELEGATE_NONE;
2397 rw_init(&fp->rf_file_rwlock, NULL, RW_DEFAULT, NULL);
2399 mutex_enter(&vp->v_vsd_lock);
2400 VERIFY(vsd_set(vp, nfs4_srv_vkey, (void *)fp) == 0);
2401 mutex_exit(&vp->v_vsd_lock);
2403 return (TRUE);
2406 rfs4_file_t *
2407 rfs4_findfile(vnode_t *vp, nfs_fh4 *fh, bool_t *create)
2409 rfs4_file_t *fp;
2410 rfs4_fcreate_arg arg;
2412 arg.vp = vp;
2413 arg.fh = fh;
2415 if (*create == TRUE)
2416 fp = (rfs4_file_t *)rfs4_dbsearch(rfs4_file_idx, vp, create,
2417 &arg, RFS4_DBS_VALID);
2418 else {
2419 mutex_enter(&vp->v_vsd_lock);
2420 fp = (rfs4_file_t *)vsd_get(vp, nfs4_srv_vkey);
2421 if (fp) {
2422 rfs4_dbe_lock(fp->rf_dbe);
2423 if (rfs4_dbe_is_invalid(fp->rf_dbe) ||
2424 (rfs4_dbe_refcnt(fp->rf_dbe) == 0)) {
2425 rfs4_dbe_unlock(fp->rf_dbe);
2426 fp = NULL;
2427 } else {
2428 rfs4_dbe_hold(fp->rf_dbe);
2429 rfs4_dbe_unlock(fp->rf_dbe);
2432 mutex_exit(&vp->v_vsd_lock);
2434 return (fp);
2438 * Find a file in the db and once it is located, take the rw lock.
2439 * Need to check the vnode pointer and if it does not exist (it was
2440 * removed between the db location and check) redo the find. This
2441 * assumes that a file struct that has a NULL vnode pointer is marked
2442 * at 'invalid' and will not be found in the db the second time
2443 * around.
2445 rfs4_file_t *
2446 rfs4_findfile_withlock(vnode_t *vp, nfs_fh4 *fh, bool_t *create)
2448 rfs4_file_t *fp;
2449 rfs4_fcreate_arg arg;
2450 bool_t screate = *create;
2452 if (screate == FALSE) {
2453 mutex_enter(&vp->v_vsd_lock);
2454 fp = (rfs4_file_t *)vsd_get(vp, nfs4_srv_vkey);
2455 if (fp) {
2456 rfs4_dbe_lock(fp->rf_dbe);
2457 if (rfs4_dbe_is_invalid(fp->rf_dbe) ||
2458 (rfs4_dbe_refcnt(fp->rf_dbe) == 0)) {
2459 rfs4_dbe_unlock(fp->rf_dbe);
2460 mutex_exit(&vp->v_vsd_lock);
2461 fp = NULL;
2462 } else {
2463 rfs4_dbe_hold(fp->rf_dbe);
2464 rfs4_dbe_unlock(fp->rf_dbe);
2465 mutex_exit(&vp->v_vsd_lock);
2466 rw_enter(&fp->rf_file_rwlock, RW_WRITER);
2467 if (fp->rf_vp == NULL) {
2468 rw_exit(&fp->rf_file_rwlock);
2469 rfs4_file_rele(fp);
2470 fp = NULL;
2473 } else {
2474 mutex_exit(&vp->v_vsd_lock);
2476 } else {
2477 retry:
2478 arg.vp = vp;
2479 arg.fh = fh;
2481 fp = (rfs4_file_t *)rfs4_dbsearch(rfs4_file_idx, vp, create,
2482 &arg, RFS4_DBS_VALID);
2483 if (fp != NULL) {
2484 rw_enter(&fp->rf_file_rwlock, RW_WRITER);
2485 if (fp->rf_vp == NULL) {
2486 rw_exit(&fp->rf_file_rwlock);
2487 rfs4_file_rele(fp);
2488 *create = screate;
2489 goto retry;
2494 return (fp);
2497 static uint32_t
2498 lo_state_hash(void *key)
2500 stateid_t *id = key;
2502 return (id->bits.ident+id->bits.pid);
2505 static bool_t
2506 lo_state_compare(rfs4_entry_t u_entry, void *key)
2508 rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2509 stateid_t *id = key;
2510 bool_t rc;
2512 rc = (lsp->rls_lockid.bits.boottime == id->bits.boottime &&
2513 lsp->rls_lockid.bits.type == id->bits.type &&
2514 lsp->rls_lockid.bits.ident == id->bits.ident &&
2515 lsp->rls_lockid.bits.pid == id->bits.pid);
2517 return (rc);
2520 static void *
2521 lo_state_mkkey(rfs4_entry_t u_entry)
2523 rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2525 return (&lsp->rls_lockid);
2528 static bool_t
2529 rfs4_lo_state_expiry(rfs4_entry_t u_entry)
2531 rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2533 if (rfs4_dbe_is_invalid(lsp->rls_dbe))
2534 return (TRUE);
2535 if (lsp->rls_state->rs_closed)
2536 return (TRUE);
2537 return ((gethrestime_sec() -
2538 lsp->rls_state->rs_owner->ro_client->rc_last_access
2539 > rfs4_lease_time));
2542 static void
2543 rfs4_lo_state_destroy(rfs4_entry_t u_entry)
2545 rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2547 rfs4_dbe_lock(lsp->rls_state->rs_dbe);
2548 list_remove(&lsp->rls_state->rs_lostatelist, lsp);
2549 rfs4_dbe_unlock(lsp->rls_state->rs_dbe);
2551 rfs4_sw_destroy(&lsp->rls_sw);
2553 /* Make sure to release the file locks */
2554 if (lsp->rls_locks_cleaned == FALSE) {
2555 lsp->rls_locks_cleaned = TRUE;
2556 if (lsp->rls_locker->rl_client->rc_sysidt != LM_NOSYSID) {
2557 /* Is the PxFS kernel module loaded? */
2558 if (lm_remove_file_locks != NULL) {
2559 int new_sysid;
2561 /* Encode the cluster nodeid in new sysid */
2562 new_sysid =
2563 lsp->rls_locker->rl_client->rc_sysidt;
2564 lm_set_nlmid_flk(&new_sysid);
2567 * This PxFS routine removes file locks for a
2568 * client over all nodes of a cluster.
2570 DTRACE_PROBE1(nfss_i_clust_rm_lck,
2571 int, new_sysid);
2572 (*lm_remove_file_locks)(new_sysid);
2573 } else {
2574 (void) cleanlocks(
2575 lsp->rls_state->rs_finfo->rf_vp,
2576 lsp->rls_locker->rl_pid,
2577 lsp->rls_locker->rl_client->rc_sysidt);
2582 /* Free the last reply for this state */
2583 rfs4_free_reply(&lsp->rls_reply);
2585 rfs4_lockowner_rele(lsp->rls_locker);
2586 lsp->rls_locker = NULL;
2588 rfs4_state_rele_nounlock(lsp->rls_state);
2589 lsp->rls_state = NULL;
2592 static bool_t
2593 rfs4_lo_state_create(rfs4_entry_t u_entry, void *arg)
2595 rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2596 rfs4_lo_state_t *argp = (rfs4_lo_state_t *)arg;
2597 rfs4_lockowner_t *lo = argp->rls_locker;
2598 rfs4_state_t *sp = argp->rls_state;
2600 lsp->rls_state = sp;
2602 lsp->rls_lockid = sp->rs_stateid;
2603 lsp->rls_lockid.bits.type = LOCKID;
2604 lsp->rls_lockid.bits.chgseq = 0;
2605 lsp->rls_lockid.bits.pid = lo->rl_pid;
2607 lsp->rls_locks_cleaned = FALSE;
2608 lsp->rls_lock_completed = FALSE;
2610 rfs4_sw_init(&lsp->rls_sw);
2612 /* Attached the supplied lock owner */
2613 rfs4_dbe_hold(lo->rl_dbe);
2614 lsp->rls_locker = lo;
2616 rfs4_dbe_lock(sp->rs_dbe);
2617 list_insert_tail(&sp->rs_lostatelist, lsp);
2618 rfs4_dbe_hold(sp->rs_dbe);
2619 rfs4_dbe_unlock(sp->rs_dbe);
2621 return (TRUE);
2624 void
2625 rfs4_lo_state_rele(rfs4_lo_state_t *lsp, bool_t unlock_fp)
2627 if (unlock_fp == TRUE)
2628 rw_exit(&lsp->rls_state->rs_finfo->rf_file_rwlock);
2629 rfs4_dbe_rele(lsp->rls_dbe);
2632 static rfs4_lo_state_t *
2633 rfs4_findlo_state(stateid_t *id, bool_t lock_fp)
2635 rfs4_lo_state_t *lsp;
2636 bool_t create = FALSE;
2638 lsp = (rfs4_lo_state_t *)rfs4_dbsearch(rfs4_lo_state_idx, id,
2639 &create, NULL, RFS4_DBS_VALID);
2640 if (lock_fp == TRUE && lsp != NULL)
2641 rw_enter(&lsp->rls_state->rs_finfo->rf_file_rwlock, RW_READER);
2643 return (lsp);
2647 static uint32_t
2648 lo_state_lo_hash(void *key)
2650 rfs4_lo_state_t *lsp = key;
2652 return (ADDRHASH(lsp->rls_locker) ^ ADDRHASH(lsp->rls_state));
2655 static bool_t
2656 lo_state_lo_compare(rfs4_entry_t u_entry, void *key)
2658 rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2659 rfs4_lo_state_t *keyp = key;
2661 return (keyp->rls_locker == lsp->rls_locker &&
2662 keyp->rls_state == lsp->rls_state);
2665 static void *
2666 lo_state_lo_mkkey(rfs4_entry_t u_entry)
2668 return (u_entry);
2671 rfs4_lo_state_t *
2672 rfs4_findlo_state_by_owner(rfs4_lockowner_t *lo, rfs4_state_t *sp,
2673 bool_t *create)
2675 rfs4_lo_state_t *lsp;
2676 rfs4_lo_state_t arg;
2678 arg.rls_locker = lo;
2679 arg.rls_state = sp;
2681 lsp = (rfs4_lo_state_t *)rfs4_dbsearch(rfs4_lo_state_owner_idx, &arg,
2682 create, &arg, RFS4_DBS_VALID);
2684 return (lsp);
2687 static stateid_t
2688 get_stateid(id_t eid)
2690 stateid_t id;
2692 id.bits.boottime = rfs4_start_time;
2693 id.bits.ident = eid;
2694 id.bits.chgseq = 0;
2695 id.bits.type = 0;
2696 id.bits.pid = 0;
2697 id.bits.clnodeid = 0;
2699 return (id);
2702 static uint32_t
2703 state_hash(void *key)
2705 stateid_t *ip = (stateid_t *)key;
2707 return (ip->bits.ident);
2710 static bool_t
2711 state_compare(rfs4_entry_t u_entry, void *key)
2713 rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2714 stateid_t *id = (stateid_t *)key;
2715 bool_t rc;
2717 rc = (sp->rs_stateid.bits.boottime == id->bits.boottime &&
2718 sp->rs_stateid.bits.ident == id->bits.ident);
2720 return (rc);
2723 static void *
2724 state_mkkey(rfs4_entry_t u_entry)
2726 rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2728 return (&sp->rs_stateid);
2731 static void
2732 rfs4_state_destroy(rfs4_entry_t u_entry)
2734 rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2736 /* remove from openowner list */
2737 rfs4_dbe_lock(sp->rs_owner->ro_dbe);
2738 list_remove(&sp->rs_owner->ro_statelist, sp);
2739 rfs4_dbe_unlock(sp->rs_owner->ro_dbe);
2741 list_destroy(&sp->rs_lostatelist);
2743 /* release any share locks for this stateid if it's still open */
2744 if (!sp->rs_closed) {
2745 rfs4_dbe_lock(sp->rs_dbe);
2746 (void) rfs4_unshare(sp);
2747 rfs4_dbe_unlock(sp->rs_dbe);
2750 /* Were done with the file */
2751 rfs4_file_rele(sp->rs_finfo);
2752 sp->rs_finfo = NULL;
2754 /* And now with the openowner */
2755 rfs4_openowner_rele(sp->rs_owner);
2756 sp->rs_owner = NULL;
2759 static void
2760 rfs4_state_rele_nounlock(rfs4_state_t *sp)
2762 rfs4_dbe_rele(sp->rs_dbe);
2765 void
2766 rfs4_state_rele(rfs4_state_t *sp)
2768 rw_exit(&sp->rs_finfo->rf_file_rwlock);
2769 rfs4_dbe_rele(sp->rs_dbe);
2772 static uint32_t
2773 deleg_hash(void *key)
2775 rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)key;
2777 return (ADDRHASH(dsp->rds_client) ^ ADDRHASH(dsp->rds_finfo));
2780 static bool_t
2781 deleg_compare(rfs4_entry_t u_entry, void *key)
2783 rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2784 rfs4_deleg_state_t *kdsp = (rfs4_deleg_state_t *)key;
2786 return (dsp->rds_client == kdsp->rds_client &&
2787 dsp->rds_finfo == kdsp->rds_finfo);
2790 static void *
2791 deleg_mkkey(rfs4_entry_t u_entry)
2793 return (u_entry);
2796 static uint32_t
2797 deleg_state_hash(void *key)
2799 stateid_t *ip = (stateid_t *)key;
2801 return (ip->bits.ident);
2804 static bool_t
2805 deleg_state_compare(rfs4_entry_t u_entry, void *key)
2807 rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2808 stateid_t *id = (stateid_t *)key;
2809 bool_t rc;
2811 if (id->bits.type != DELEGID)
2812 return (FALSE);
2814 rc = (dsp->rds_delegid.bits.boottime == id->bits.boottime &&
2815 dsp->rds_delegid.bits.ident == id->bits.ident);
2817 return (rc);
2820 static void *
2821 deleg_state_mkkey(rfs4_entry_t u_entry)
2823 rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2825 return (&dsp->rds_delegid);
2828 static bool_t
2829 rfs4_deleg_state_expiry(rfs4_entry_t u_entry)
2831 rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2833 if (rfs4_dbe_is_invalid(dsp->rds_dbe))
2834 return (TRUE);
2836 if (dsp->rds_dtype == OPEN_DELEGATE_NONE)
2837 return (TRUE);
2839 if ((gethrestime_sec() - dsp->rds_client->rc_last_access
2840 > rfs4_lease_time)) {
2841 rfs4_dbe_invalidate(dsp->rds_dbe);
2842 return (TRUE);
2845 return (FALSE);
2848 static bool_t
2849 rfs4_deleg_state_create(rfs4_entry_t u_entry, void *argp)
2851 rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2852 rfs4_file_t *fp = ((rfs4_deleg_state_t *)argp)->rds_finfo;
2853 rfs4_client_t *cp = ((rfs4_deleg_state_t *)argp)->rds_client;
2855 rfs4_dbe_hold(fp->rf_dbe);
2856 rfs4_dbe_hold(cp->rc_dbe);
2858 dsp->rds_delegid = get_stateid(rfs4_dbe_getid(dsp->rds_dbe));
2859 dsp->rds_delegid.bits.type = DELEGID;
2860 dsp->rds_finfo = fp;
2861 dsp->rds_client = cp;
2862 dsp->rds_dtype = OPEN_DELEGATE_NONE;
2864 dsp->rds_time_granted = gethrestime_sec(); /* observability */
2865 dsp->rds_time_revoked = 0;
2867 list_link_init(&dsp->rds_node);
2869 return (TRUE);
2872 static void
2873 rfs4_deleg_state_destroy(rfs4_entry_t u_entry)
2875 rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2877 /* return delegation if necessary */
2878 rfs4_return_deleg(dsp, FALSE);
2880 /* Were done with the file */
2881 rfs4_file_rele(dsp->rds_finfo);
2882 dsp->rds_finfo = NULL;
2884 /* And now with the openowner */
2885 rfs4_client_rele(dsp->rds_client);
2886 dsp->rds_client = NULL;
2889 rfs4_deleg_state_t *
2890 rfs4_finddeleg(rfs4_state_t *sp, bool_t *create)
2892 rfs4_deleg_state_t ds, *dsp;
2894 ds.rds_client = sp->rs_owner->ro_client;
2895 ds.rds_finfo = sp->rs_finfo;
2897 dsp = (rfs4_deleg_state_t *)rfs4_dbsearch(rfs4_deleg_idx, &ds,
2898 create, &ds, RFS4_DBS_VALID);
2900 return (dsp);
2903 rfs4_deleg_state_t *
2904 rfs4_finddelegstate(stateid_t *id)
2906 rfs4_deleg_state_t *dsp;
2907 bool_t create = FALSE;
2909 dsp = (rfs4_deleg_state_t *)rfs4_dbsearch(rfs4_deleg_state_idx, id,
2910 &create, NULL, RFS4_DBS_VALID);
2912 return (dsp);
2915 void
2916 rfs4_deleg_state_rele(rfs4_deleg_state_t *dsp)
2918 rfs4_dbe_rele(dsp->rds_dbe);
2921 void
2922 rfs4_update_lock_sequence(rfs4_lo_state_t *lsp)
2925 rfs4_dbe_lock(lsp->rls_dbe);
2928 * If we are skipping sequence id checking, this means that
2929 * this is the first lock request and therefore the sequence
2930 * id does not need to be updated. This only happens on the
2931 * first lock request for a lockowner
2933 if (!lsp->rls_skip_seqid_check)
2934 lsp->rls_seqid++;
2936 rfs4_dbe_unlock(lsp->rls_dbe);
2939 void
2940 rfs4_update_lock_resp(rfs4_lo_state_t *lsp, nfs_resop4 *resp)
2943 rfs4_dbe_lock(lsp->rls_dbe);
2945 rfs4_free_reply(&lsp->rls_reply);
2947 rfs4_copy_reply(&lsp->rls_reply, resp);
2949 rfs4_dbe_unlock(lsp->rls_dbe);
2952 void
2953 rfs4_free_opens(rfs4_openowner_t *oo, bool_t invalidate,
2954 bool_t close_of_client)
2956 rfs4_state_t *sp;
2958 rfs4_dbe_lock(oo->ro_dbe);
2960 for (sp = list_head(&oo->ro_statelist); sp != NULL;
2961 sp = list_next(&oo->ro_statelist, sp)) {
2962 rfs4_state_close(sp, FALSE, close_of_client, CRED());
2963 if (invalidate == TRUE)
2964 rfs4_dbe_invalidate(sp->rs_dbe);
2967 rfs4_dbe_invalidate(oo->ro_dbe);
2968 rfs4_dbe_unlock(oo->ro_dbe);
2971 static uint32_t
2972 state_owner_file_hash(void *key)
2974 rfs4_state_t *sp = key;
2976 return (ADDRHASH(sp->rs_owner) ^ ADDRHASH(sp->rs_finfo));
2979 static bool_t
2980 state_owner_file_compare(rfs4_entry_t u_entry, void *key)
2982 rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2983 rfs4_state_t *arg = key;
2985 if (sp->rs_closed == TRUE)
2986 return (FALSE);
2988 return (arg->rs_owner == sp->rs_owner && arg->rs_finfo == sp->rs_finfo);
2991 static void *
2992 state_owner_file_mkkey(rfs4_entry_t u_entry)
2994 return (u_entry);
2997 static uint32_t
2998 state_file_hash(void *key)
3000 return (ADDRHASH(key));
3003 static bool_t
3004 state_file_compare(rfs4_entry_t u_entry, void *key)
3006 rfs4_state_t *sp = (rfs4_state_t *)u_entry;
3007 rfs4_file_t *fp = key;
3009 if (sp->rs_closed == TRUE)
3010 return (FALSE);
3012 return (fp == sp->rs_finfo);
3015 static void *
3016 state_file_mkkey(rfs4_entry_t u_entry)
3018 rfs4_state_t *sp = (rfs4_state_t *)u_entry;
3020 return (sp->rs_finfo);
3023 rfs4_state_t *
3024 rfs4_findstate_by_owner_file(rfs4_openowner_t *oo, rfs4_file_t *fp,
3025 bool_t *create)
3027 rfs4_state_t *sp;
3028 rfs4_state_t key;
3030 key.rs_owner = oo;
3031 key.rs_finfo = fp;
3033 sp = (rfs4_state_t *)rfs4_dbsearch(rfs4_state_owner_file_idx, &key,
3034 create, &key, RFS4_DBS_VALID);
3036 return (sp);
3039 /* This returns ANY state struct that refers to this file */
3040 static rfs4_state_t *
3041 rfs4_findstate_by_file(rfs4_file_t *fp)
3043 bool_t create = FALSE;
3045 return ((rfs4_state_t *)rfs4_dbsearch(rfs4_state_file_idx, fp,
3046 &create, fp, RFS4_DBS_VALID));
3049 static bool_t
3050 rfs4_state_expiry(rfs4_entry_t u_entry)
3052 rfs4_state_t *sp = (rfs4_state_t *)u_entry;
3054 if (rfs4_dbe_is_invalid(sp->rs_dbe))
3055 return (TRUE);
3057 if (sp->rs_closed == TRUE &&
3058 ((gethrestime_sec() - rfs4_dbe_get_timerele(sp->rs_dbe))
3059 > rfs4_lease_time))
3060 return (TRUE);
3062 return ((gethrestime_sec() - sp->rs_owner->ro_client->rc_last_access
3063 > rfs4_lease_time));
3066 static bool_t
3067 rfs4_state_create(rfs4_entry_t u_entry, void *argp)
3069 rfs4_state_t *sp = (rfs4_state_t *)u_entry;
3070 rfs4_file_t *fp = ((rfs4_state_t *)argp)->rs_finfo;
3071 rfs4_openowner_t *oo = ((rfs4_state_t *)argp)->rs_owner;
3073 rfs4_dbe_hold(fp->rf_dbe);
3074 rfs4_dbe_hold(oo->ro_dbe);
3075 sp->rs_stateid = get_stateid(rfs4_dbe_getid(sp->rs_dbe));
3076 sp->rs_stateid.bits.type = OPENID;
3077 sp->rs_owner = oo;
3078 sp->rs_finfo = fp;
3080 list_create(&sp->rs_lostatelist, sizeof (rfs4_lo_state_t),
3081 offsetof(rfs4_lo_state_t, rls_node));
3083 /* Insert state on per open owner's list */
3084 rfs4_dbe_lock(oo->ro_dbe);
3085 list_insert_tail(&oo->ro_statelist, sp);
3086 rfs4_dbe_unlock(oo->ro_dbe);
3088 return (TRUE);
3091 static rfs4_state_t *
3092 rfs4_findstate(stateid_t *id, rfs4_dbsearch_type_t find_invalid, bool_t lock_fp)
3094 rfs4_state_t *sp;
3095 bool_t create = FALSE;
3097 sp = (rfs4_state_t *)rfs4_dbsearch(rfs4_state_idx, id,
3098 &create, NULL, find_invalid);
3099 if (lock_fp == TRUE && sp != NULL)
3100 rw_enter(&sp->rs_finfo->rf_file_rwlock, RW_READER);
3102 return (sp);
3105 void
3106 rfs4_state_close(rfs4_state_t *sp, bool_t lock_held, bool_t close_of_client,
3107 cred_t *cr)
3109 /* Remove the associated lo_state owners */
3110 if (!lock_held)
3111 rfs4_dbe_lock(sp->rs_dbe);
3114 * If refcnt == 0, the dbe is about to be destroyed.
3115 * lock state will be released by the reaper thread.
3118 if (rfs4_dbe_refcnt(sp->rs_dbe) > 0) {
3119 if (sp->rs_closed == FALSE) {
3120 rfs4_release_share_lock_state(sp, cr, close_of_client);
3121 sp->rs_closed = TRUE;
3125 if (!lock_held)
3126 rfs4_dbe_unlock(sp->rs_dbe);
3130 * Remove all state associated with the given client.
3132 void
3133 rfs4_client_state_remove(rfs4_client_t *cp)
3135 rfs4_openowner_t *oo;
3137 rfs4_dbe_lock(cp->rc_dbe);
3139 for (oo = list_head(&cp->rc_openownerlist); oo != NULL;
3140 oo = list_next(&cp->rc_openownerlist, oo)) {
3141 rfs4_free_opens(oo, TRUE, TRUE);
3144 rfs4_dbe_unlock(cp->rc_dbe);
3147 void
3148 rfs4_client_close(rfs4_client_t *cp)
3150 /* Mark client as going away. */
3151 rfs4_dbe_lock(cp->rc_dbe);
3152 rfs4_dbe_invalidate(cp->rc_dbe);
3153 rfs4_dbe_unlock(cp->rc_dbe);
3155 rfs4_client_state_remove(cp);
3157 /* Release the client */
3158 rfs4_client_rele(cp);
3161 nfsstat4
3162 rfs4_check_clientid(clientid4 *cp, int setclid_confirm)
3164 cid *cidp = (cid *) cp;
3167 * If the server start time matches the time provided
3168 * by the client (via the clientid) and this is NOT a
3169 * setclientid_confirm then return EXPIRED.
3171 if (!setclid_confirm && cidp->impl_id.start_time == rfs4_start_time)
3172 return (NFS4ERR_EXPIRED);
3174 return (NFS4ERR_STALE_CLIENTID);
3178 * This is used when a stateid has not been found amongst the
3179 * current server's state. Check the stateid to see if it
3180 * was from this server instantiation or not.
3182 static nfsstat4
3183 what_stateid_error(stateid_t *id, stateid_type_t type)
3185 /* If types don't match then no use checking further */
3186 if (type != id->bits.type)
3187 return (NFS4ERR_BAD_STATEID);
3189 /* From a different server instantiation, return STALE */
3190 if (id->bits.boottime != rfs4_start_time)
3191 return (NFS4ERR_STALE_STATEID);
3194 * From this server but the state is most likely beyond lease
3195 * timeout: return NFS4ERR_EXPIRED. However, there is the
3196 * case of a delegation stateid. For delegations, there is a
3197 * case where the state can be removed without the client's
3198 * knowledge/consent: revocation. In the case of delegation
3199 * revocation, the delegation state will be removed and will
3200 * not be found. If the client does something like a
3201 * DELEGRETURN or even a READ/WRITE with a delegatoin stateid
3202 * that has been revoked, the server should return BAD_STATEID
3203 * instead of the more common EXPIRED error.
3205 if (id->bits.boottime == rfs4_start_time) {
3206 if (type == DELEGID)
3207 return (NFS4ERR_BAD_STATEID);
3208 else
3209 return (NFS4ERR_EXPIRED);
3212 return (NFS4ERR_BAD_STATEID);
3216 * Used later on to find the various state structs. When called from
3217 * rfs4_check_stateid()->rfs4_get_all_state(), no file struct lock is
3218 * taken (it is not needed) and helps on the read/write path with
3219 * respect to performance.
3221 static nfsstat4
3222 rfs4_get_state_lockit(stateid4 *stateid, rfs4_state_t **spp,
3223 rfs4_dbsearch_type_t find_invalid, bool_t lock_fp)
3225 stateid_t *id = (stateid_t *)stateid;
3226 rfs4_state_t *sp;
3228 *spp = NULL;
3230 sp = rfs4_findstate(id, find_invalid, lock_fp);
3231 if (sp == NULL) {
3232 return (what_stateid_error(id, OPENID));
3235 if (rfs4_lease_expired(sp->rs_owner->ro_client)) {
3236 if (lock_fp == TRUE)
3237 rfs4_state_rele(sp);
3238 else
3239 rfs4_state_rele_nounlock(sp);
3240 return (NFS4ERR_EXPIRED);
3243 *spp = sp;
3245 return (NFS4_OK);
3248 nfsstat4
3249 rfs4_get_state(stateid4 *stateid, rfs4_state_t **spp,
3250 rfs4_dbsearch_type_t find_invalid)
3252 return (rfs4_get_state_lockit(stateid, spp, find_invalid, TRUE));
3256 rfs4_check_stateid_seqid(rfs4_state_t *sp, stateid4 *stateid)
3258 stateid_t *id = (stateid_t *)stateid;
3260 if (rfs4_lease_expired(sp->rs_owner->ro_client))
3261 return (NFS4_CHECK_STATEID_EXPIRED);
3263 /* Stateid is some time in the future - that's bad */
3264 if (sp->rs_stateid.bits.chgseq < id->bits.chgseq)
3265 return (NFS4_CHECK_STATEID_BAD);
3267 if (sp->rs_stateid.bits.chgseq == id->bits.chgseq + 1)
3268 return (NFS4_CHECK_STATEID_REPLAY);
3270 /* Stateid is some time in the past - that's old */
3271 if (sp->rs_stateid.bits.chgseq > id->bits.chgseq)
3272 return (NFS4_CHECK_STATEID_OLD);
3274 /* Caller needs to know about confirmation before closure */
3275 if (sp->rs_owner->ro_need_confirm)
3276 return (NFS4_CHECK_STATEID_UNCONFIRMED);
3278 if (sp->rs_closed == TRUE)
3279 return (NFS4_CHECK_STATEID_CLOSED);
3281 return (NFS4_CHECK_STATEID_OKAY);
3285 rfs4_check_lo_stateid_seqid(rfs4_lo_state_t *lsp, stateid4 *stateid)
3287 stateid_t *id = (stateid_t *)stateid;
3289 if (rfs4_lease_expired(lsp->rls_state->rs_owner->ro_client))
3290 return (NFS4_CHECK_STATEID_EXPIRED);
3292 /* Stateid is some time in the future - that's bad */
3293 if (lsp->rls_lockid.bits.chgseq < id->bits.chgseq)
3294 return (NFS4_CHECK_STATEID_BAD);
3296 if (lsp->rls_lockid.bits.chgseq == id->bits.chgseq + 1)
3297 return (NFS4_CHECK_STATEID_REPLAY);
3299 /* Stateid is some time in the past - that's old */
3300 if (lsp->rls_lockid.bits.chgseq > id->bits.chgseq)
3301 return (NFS4_CHECK_STATEID_OLD);
3303 if (lsp->rls_state->rs_closed == TRUE)
3304 return (NFS4_CHECK_STATEID_CLOSED);
3306 return (NFS4_CHECK_STATEID_OKAY);
3309 nfsstat4
3310 rfs4_get_deleg_state(stateid4 *stateid, rfs4_deleg_state_t **dspp)
3312 stateid_t *id = (stateid_t *)stateid;
3313 rfs4_deleg_state_t *dsp;
3315 *dspp = NULL;
3317 dsp = rfs4_finddelegstate(id);
3318 if (dsp == NULL) {
3319 return (what_stateid_error(id, DELEGID));
3322 if (rfs4_lease_expired(dsp->rds_client)) {
3323 rfs4_deleg_state_rele(dsp);
3324 return (NFS4ERR_EXPIRED);
3327 *dspp = dsp;
3329 return (NFS4_OK);
3332 nfsstat4
3333 rfs4_get_lo_state(stateid4 *stateid, rfs4_lo_state_t **lspp, bool_t lock_fp)
3335 stateid_t *id = (stateid_t *)stateid;
3336 rfs4_lo_state_t *lsp;
3338 *lspp = NULL;
3340 lsp = rfs4_findlo_state(id, lock_fp);
3341 if (lsp == NULL) {
3342 return (what_stateid_error(id, LOCKID));
3345 if (rfs4_lease_expired(lsp->rls_state->rs_owner->ro_client)) {
3346 rfs4_lo_state_rele(lsp, lock_fp);
3347 return (NFS4ERR_EXPIRED);
3350 *lspp = lsp;
3352 return (NFS4_OK);
3355 static nfsstat4
3356 rfs4_get_all_state(stateid4 *sid, rfs4_state_t **spp,
3357 rfs4_deleg_state_t **dspp, rfs4_lo_state_t **lspp)
3359 rfs4_state_t *sp = NULL;
3360 rfs4_deleg_state_t *dsp = NULL;
3361 rfs4_lo_state_t *lsp = NULL;
3362 stateid_t *id;
3363 nfsstat4 status;
3365 *spp = NULL; *dspp = NULL; *lspp = NULL;
3367 id = (stateid_t *)sid;
3368 switch (id->bits.type) {
3369 case OPENID:
3370 status = rfs4_get_state_lockit(sid, &sp, FALSE, FALSE);
3371 break;
3372 case DELEGID:
3373 status = rfs4_get_deleg_state(sid, &dsp);
3374 break;
3375 case LOCKID:
3376 status = rfs4_get_lo_state(sid, &lsp, FALSE);
3377 if (status == NFS4_OK) {
3378 sp = lsp->rls_state;
3379 rfs4_dbe_hold(sp->rs_dbe);
3381 break;
3382 default:
3383 status = NFS4ERR_BAD_STATEID;
3386 if (status == NFS4_OK) {
3387 *spp = sp;
3388 *dspp = dsp;
3389 *lspp = lsp;
3392 return (status);
3396 * Given the I/O mode (FREAD or FWRITE), this checks whether the
3397 * rfs4_state_t struct has access to do this operation and if so
3398 * return NFS4_OK; otherwise the proper NFSv4 error is returned.
3400 nfsstat4
3401 rfs4_state_has_access(rfs4_state_t *sp, int mode, vnode_t *vp)
3403 nfsstat4 stat = NFS4_OK;
3404 rfs4_file_t *fp;
3405 bool_t create = FALSE;
3407 rfs4_dbe_lock(sp->rs_dbe);
3408 if (mode == FWRITE) {
3409 if (!(sp->rs_share_access & OPEN4_SHARE_ACCESS_WRITE)) {
3410 stat = NFS4ERR_OPENMODE;
3412 } else if (mode == FREAD) {
3413 if (!(sp->rs_share_access & OPEN4_SHARE_ACCESS_READ)) {
3415 * If we have OPENed the file with DENYing access
3416 * to both READ and WRITE then no one else could
3417 * have OPENed the file, hence no conflicting READ
3418 * deny. This check is merely an optimization.
3420 if (sp->rs_share_deny == OPEN4_SHARE_DENY_BOTH)
3421 goto out;
3423 /* Check against file struct's DENY mode */
3424 fp = rfs4_findfile(vp, NULL, &create);
3425 if (fp != NULL) {
3426 int deny_read = 0;
3427 rfs4_dbe_lock(fp->rf_dbe);
3429 * Check if any other open owner has the file
3430 * OPENed with deny READ.
3432 if (sp->rs_share_deny & OPEN4_SHARE_DENY_READ)
3433 deny_read = 1;
3434 ASSERT(fp->rf_deny_read >= deny_read);
3435 if (fp->rf_deny_read > deny_read)
3436 stat = NFS4ERR_OPENMODE;
3437 rfs4_dbe_unlock(fp->rf_dbe);
3438 rfs4_file_rele(fp);
3441 } else {
3442 /* Illegal I/O mode */
3443 stat = NFS4ERR_INVAL;
3445 out:
3446 rfs4_dbe_unlock(sp->rs_dbe);
3447 return (stat);
3451 * Given the I/O mode (FREAD or FWRITE), the vnode, the stateid and whether
3452 * the file is being truncated, return NFS4_OK if allowed or appropriate
3453 * V4 error if not. Note NFS4ERR_DELAY will be returned and a recall on
3454 * the associated file will be done if the I/O is not consistent with any
3455 * delegation in effect on the file. Should be holding fop_rwlock, either
3456 * as reader or writer as appropriate. rfs4_op_open will acquire the
3457 * fop_rwlock as writer when setting up delegation. If the stateid is bad
3458 * this routine will return NFS4ERR_BAD_STATEID. In addition, through the
3459 * deleg parameter, we will return whether a write delegation is held by
3460 * the client associated with this stateid.
3461 * If the server instance associated with the relevant client is in its
3462 * grace period, return NFS4ERR_GRACE.
3465 nfsstat4
3466 rfs4_check_stateid(int mode, vnode_t *vp,
3467 stateid4 *stateid, bool_t trunc, bool_t *deleg,
3468 bool_t do_access, caller_context_t *ct)
3470 rfs4_file_t *fp;
3471 bool_t create = FALSE;
3472 rfs4_state_t *sp;
3473 rfs4_deleg_state_t *dsp;
3474 rfs4_lo_state_t *lsp;
3475 stateid_t *id = (stateid_t *)stateid;
3476 nfsstat4 stat = NFS4_OK;
3478 if (ct != NULL) {
3479 ct->cc_sysid = 0;
3480 ct->cc_pid = 0;
3481 ct->cc_caller_id = nfs4_srv_caller_id;
3482 ct->cc_flags = CC_DONTBLOCK;
3485 if (ISSPECIAL(stateid)) {
3486 fp = rfs4_findfile(vp, NULL, &create);
3487 if (fp == NULL)
3488 return (NFS4_OK);
3489 if (fp->rf_dinfo.rd_dtype == OPEN_DELEGATE_NONE) {
3490 rfs4_file_rele(fp);
3491 return (NFS4_OK);
3493 if (mode == FWRITE ||
3494 fp->rf_dinfo.rd_dtype == OPEN_DELEGATE_WRITE) {
3495 rfs4_recall_deleg(fp, trunc, NULL);
3496 rfs4_file_rele(fp);
3497 return (NFS4ERR_DELAY);
3499 rfs4_file_rele(fp);
3500 return (NFS4_OK);
3501 } else {
3502 stat = rfs4_get_all_state(stateid, &sp, &dsp, &lsp);
3503 if (stat != NFS4_OK)
3504 return (stat);
3505 if (lsp != NULL) {
3506 /* Is associated server instance in its grace period? */
3507 if (rfs4_clnt_in_grace(lsp->rls_locker->rl_client)) {
3508 rfs4_lo_state_rele(lsp, FALSE);
3509 if (sp != NULL)
3510 rfs4_state_rele_nounlock(sp);
3511 return (NFS4ERR_GRACE);
3513 if (id->bits.type == LOCKID) {
3514 /* Seqid in the future? - that's bad */
3515 if (lsp->rls_lockid.bits.chgseq <
3516 id->bits.chgseq) {
3517 rfs4_lo_state_rele(lsp, FALSE);
3518 if (sp != NULL)
3519 rfs4_state_rele_nounlock(sp);
3520 return (NFS4ERR_BAD_STATEID);
3522 /* Seqid in the past? - that's old */
3523 if (lsp->rls_lockid.bits.chgseq >
3524 id->bits.chgseq) {
3525 rfs4_lo_state_rele(lsp, FALSE);
3526 if (sp != NULL)
3527 rfs4_state_rele_nounlock(sp);
3528 return (NFS4ERR_OLD_STATEID);
3530 /* Ensure specified filehandle matches */
3531 if (lsp->rls_state->rs_finfo->rf_vp != vp) {
3532 rfs4_lo_state_rele(lsp, FALSE);
3533 if (sp != NULL)
3534 rfs4_state_rele_nounlock(sp);
3535 return (NFS4ERR_BAD_STATEID);
3538 if (ct != NULL) {
3539 ct->cc_sysid =
3540 lsp->rls_locker->rl_client->rc_sysidt;
3541 ct->cc_pid = lsp->rls_locker->rl_pid;
3543 rfs4_lo_state_rele(lsp, FALSE);
3546 /* Stateid provided was an "open" stateid */
3547 if (sp != NULL) {
3548 /* Is associated server instance in its grace period? */
3549 if (rfs4_clnt_in_grace(sp->rs_owner->ro_client)) {
3550 rfs4_state_rele_nounlock(sp);
3551 return (NFS4ERR_GRACE);
3553 if (id->bits.type == OPENID) {
3554 /* Seqid in the future? - that's bad */
3555 if (sp->rs_stateid.bits.chgseq <
3556 id->bits.chgseq) {
3557 rfs4_state_rele_nounlock(sp);
3558 return (NFS4ERR_BAD_STATEID);
3560 /* Seqid in the past - that's old */
3561 if (sp->rs_stateid.bits.chgseq >
3562 id->bits.chgseq) {
3563 rfs4_state_rele_nounlock(sp);
3564 return (NFS4ERR_OLD_STATEID);
3567 /* Ensure specified filehandle matches */
3568 if (sp->rs_finfo->rf_vp != vp) {
3569 rfs4_state_rele_nounlock(sp);
3570 return (NFS4ERR_BAD_STATEID);
3573 if (sp->rs_owner->ro_need_confirm) {
3574 rfs4_state_rele_nounlock(sp);
3575 return (NFS4ERR_BAD_STATEID);
3578 if (sp->rs_closed == TRUE) {
3579 rfs4_state_rele_nounlock(sp);
3580 return (NFS4ERR_OLD_STATEID);
3583 if (do_access)
3584 stat = rfs4_state_has_access(sp, mode, vp);
3585 else
3586 stat = NFS4_OK;
3589 * Return whether this state has write
3590 * delegation if desired
3592 if (deleg && (sp->rs_finfo->rf_dinfo.rd_dtype ==
3593 OPEN_DELEGATE_WRITE))
3594 *deleg = TRUE;
3597 * We got a valid stateid, so we update the
3598 * lease on the client. Ideally we would like
3599 * to do this after the calling op succeeds,
3600 * but for now this will be good
3601 * enough. Callers of this routine are
3602 * currently insulated from the state stuff.
3604 rfs4_update_lease(sp->rs_owner->ro_client);
3607 * If a delegation is present on this file and
3608 * this is a WRITE, then update the lastwrite
3609 * time to indicate that activity is present.
3611 if (sp->rs_finfo->rf_dinfo.rd_dtype ==
3612 OPEN_DELEGATE_WRITE &&
3613 mode == FWRITE) {
3614 sp->rs_finfo->rf_dinfo.rd_time_lastwrite =
3615 gethrestime_sec();
3618 rfs4_state_rele_nounlock(sp);
3620 return (stat);
3623 if (dsp != NULL) {
3624 /* Is associated server instance in its grace period? */
3625 if (rfs4_clnt_in_grace(dsp->rds_client)) {
3626 rfs4_deleg_state_rele(dsp);
3627 return (NFS4ERR_GRACE);
3629 if (dsp->rds_delegid.bits.chgseq != id->bits.chgseq) {
3630 rfs4_deleg_state_rele(dsp);
3631 return (NFS4ERR_BAD_STATEID);
3634 /* Ensure specified filehandle matches */
3635 if (dsp->rds_finfo->rf_vp != vp) {
3636 rfs4_deleg_state_rele(dsp);
3637 return (NFS4ERR_BAD_STATEID);
3640 * Return whether this state has write
3641 * delegation if desired
3643 if (deleg && (dsp->rds_finfo->rf_dinfo.rd_dtype ==
3644 OPEN_DELEGATE_WRITE))
3645 *deleg = TRUE;
3647 rfs4_update_lease(dsp->rds_client);
3650 * If a delegation is present on this file and
3651 * this is a WRITE, then update the lastwrite
3652 * time to indicate that activity is present.
3654 if (dsp->rds_finfo->rf_dinfo.rd_dtype ==
3655 OPEN_DELEGATE_WRITE && mode == FWRITE) {
3656 dsp->rds_finfo->rf_dinfo.rd_time_lastwrite =
3657 gethrestime_sec();
3661 * XXX - what happens if this is a WRITE and the
3662 * delegation type of for READ.
3664 rfs4_deleg_state_rele(dsp);
3666 return (stat);
3669 * If we got this far, something bad happened
3671 return (NFS4ERR_BAD_STATEID);
3677 * This is a special function in that for the file struct provided the
3678 * server wants to remove/close all current state associated with the
3679 * file. The prime use of this would be with OP_REMOVE to force the
3680 * release of state and particularly of file locks.
3682 * There is an assumption that there is no delegations outstanding on
3683 * this file at this point. The caller should have waited for those
3684 * to be returned or revoked.
3686 void
3687 rfs4_close_all_state(rfs4_file_t *fp)
3689 rfs4_state_t *sp;
3691 rfs4_dbe_lock(fp->rf_dbe);
3693 #ifdef DEBUG
3694 /* only applies when server is handing out delegations */
3695 if (rfs4_deleg_policy != SRV_NEVER_DELEGATE)
3696 ASSERT(fp->rf_dinfo.rd_hold_grant > 0);
3697 #endif
3699 /* No delegations for this file */
3700 ASSERT(list_is_empty(&fp->rf_delegstatelist));
3702 /* Make sure that it can not be found */
3703 rfs4_dbe_invalidate(fp->rf_dbe);
3705 if (fp->rf_vp == NULL) {
3706 rfs4_dbe_unlock(fp->rf_dbe);
3707 return;
3709 rfs4_dbe_unlock(fp->rf_dbe);
3712 * Hold as writer to prevent other server threads from
3713 * processing requests related to the file while all state is
3714 * being removed.
3716 rw_enter(&fp->rf_file_rwlock, RW_WRITER);
3718 /* Remove ALL state from the file */
3719 while (sp = rfs4_findstate_by_file(fp)) {
3720 rfs4_state_close(sp, FALSE, FALSE, CRED());
3721 rfs4_state_rele_nounlock(sp);
3725 * This is only safe since there are no further references to
3726 * the file.
3728 rfs4_dbe_lock(fp->rf_dbe);
3729 if (fp->rf_vp) {
3730 vnode_t *vp = fp->rf_vp;
3732 mutex_enter(&vp->v_vsd_lock);
3733 (void) vsd_set(vp, nfs4_srv_vkey, NULL);
3734 mutex_exit(&vp->v_vsd_lock);
3735 VN_RELE(vp);
3736 fp->rf_vp = NULL;
3738 rfs4_dbe_unlock(fp->rf_dbe);
3740 /* Finally let other references to proceed */
3741 rw_exit(&fp->rf_file_rwlock);
3745 * This function is used as a target for the rfs4_dbe_walk() call
3746 * below. The purpose of this function is to see if the
3747 * lockowner_state refers to a file that resides within the exportinfo
3748 * export. If so, then remove the lock_owner state (file locks and
3749 * share "locks") for this object since the intent is the server is
3750 * unexporting the specified directory. Be sure to invalidate the
3751 * object after the state has been released
3753 static void
3754 rfs4_lo_state_walk_callout(rfs4_entry_t u_entry, void *e)
3756 rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
3757 struct exportinfo *exi = (struct exportinfo *)e;
3758 nfs_fh4_fmt_t fhfmt4, *exi_fhp, *finfo_fhp;
3759 fhandle_t *efhp;
3761 efhp = (fhandle_t *)&exi->exi_fh;
3762 exi_fhp = (nfs_fh4_fmt_t *)&fhfmt4;
3764 FH_TO_FMT4(efhp, exi_fhp);
3766 finfo_fhp = (nfs_fh4_fmt_t *)lsp->rls_state->rs_finfo->
3767 rf_filehandle.nfs_fh4_val;
3769 if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
3770 bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
3771 exi_fhp->fh4_xlen) == 0) {
3772 rfs4_state_close(lsp->rls_state, FALSE, FALSE, CRED());
3773 rfs4_dbe_invalidate(lsp->rls_dbe);
3774 rfs4_dbe_invalidate(lsp->rls_state->rs_dbe);
3779 * This function is used as a target for the rfs4_dbe_walk() call
3780 * below. The purpose of this function is to see if the state refers
3781 * to a file that resides within the exportinfo export. If so, then
3782 * remove the open state for this object since the intent is the
3783 * server is unexporting the specified directory. The main result for
3784 * this type of entry is to invalidate it such it will not be found in
3785 * the future.
3787 static void
3788 rfs4_state_walk_callout(rfs4_entry_t u_entry, void *e)
3790 rfs4_state_t *sp = (rfs4_state_t *)u_entry;
3791 struct exportinfo *exi = (struct exportinfo *)e;
3792 nfs_fh4_fmt_t fhfmt4, *exi_fhp, *finfo_fhp;
3793 fhandle_t *efhp;
3795 efhp = (fhandle_t *)&exi->exi_fh;
3796 exi_fhp = (nfs_fh4_fmt_t *)&fhfmt4;
3798 FH_TO_FMT4(efhp, exi_fhp);
3800 finfo_fhp =
3801 (nfs_fh4_fmt_t *)sp->rs_finfo->rf_filehandle.nfs_fh4_val;
3803 if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
3804 bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
3805 exi_fhp->fh4_xlen) == 0) {
3806 rfs4_state_close(sp, TRUE, FALSE, CRED());
3807 rfs4_dbe_invalidate(sp->rs_dbe);
3812 * This function is used as a target for the rfs4_dbe_walk() call
3813 * below. The purpose of this function is to see if the state refers
3814 * to a file that resides within the exportinfo export. If so, then
3815 * remove the deleg state for this object since the intent is the
3816 * server is unexporting the specified directory. The main result for
3817 * this type of entry is to invalidate it such it will not be found in
3818 * the future.
3820 static void
3821 rfs4_deleg_state_walk_callout(rfs4_entry_t u_entry, void *e)
3823 rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
3824 struct exportinfo *exi = (struct exportinfo *)e;
3825 nfs_fh4_fmt_t fhfmt4, *exi_fhp, *finfo_fhp;
3826 fhandle_t *efhp;
3828 efhp = (fhandle_t *)&exi->exi_fh;
3829 exi_fhp = (nfs_fh4_fmt_t *)&fhfmt4;
3831 FH_TO_FMT4(efhp, exi_fhp);
3833 finfo_fhp =
3834 (nfs_fh4_fmt_t *)dsp->rds_finfo->rf_filehandle.nfs_fh4_val;
3836 if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
3837 bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
3838 exi_fhp->fh4_xlen) == 0) {
3839 rfs4_dbe_invalidate(dsp->rds_dbe);
3844 * This function is used as a target for the rfs4_dbe_walk() call
3845 * below. The purpose of this function is to see if the state refers
3846 * to a file that resides within the exportinfo export. If so, then
3847 * release vnode hold for this object since the intent is the server
3848 * is unexporting the specified directory. Invalidation will prevent
3849 * this struct from being found in the future.
3851 static void
3852 rfs4_file_walk_callout(rfs4_entry_t u_entry, void *e)
3854 rfs4_file_t *fp = (rfs4_file_t *)u_entry;
3855 struct exportinfo *exi = (struct exportinfo *)e;
3856 nfs_fh4_fmt_t fhfmt4, *exi_fhp, *finfo_fhp;
3857 fhandle_t *efhp;
3859 efhp = (fhandle_t *)&exi->exi_fh;
3860 exi_fhp = (nfs_fh4_fmt_t *)&fhfmt4;
3862 FH_TO_FMT4(efhp, exi_fhp);
3864 finfo_fhp = (nfs_fh4_fmt_t *)fp->rf_filehandle.nfs_fh4_val;
3866 if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
3867 bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
3868 exi_fhp->fh4_xlen) == 0) {
3869 if (fp->rf_vp) {
3870 vnode_t *vp = fp->rf_vp;
3873 * don't leak monitors and remove the reference
3874 * put on the vnode when the delegation was granted.
3876 if (fp->rf_dinfo.rd_dtype == OPEN_DELEGATE_READ) {
3877 (void) fem_uninstall(vp, &deleg_rdops, fp);
3878 vn_open_downgrade(vp, FREAD);
3879 } else if (fp->rf_dinfo.rd_dtype ==
3880 OPEN_DELEGATE_WRITE) {
3881 (void) fem_uninstall(vp, &deleg_wrops, fp);
3882 vn_open_downgrade(vp, FREAD|FWRITE);
3884 mutex_enter(&vp->v_vsd_lock);
3885 (void) vsd_set(vp, nfs4_srv_vkey, NULL);
3886 mutex_exit(&vp->v_vsd_lock);
3887 VN_RELE(vp);
3888 fp->rf_vp = NULL;
3890 rfs4_dbe_invalidate(fp->rf_dbe);
3895 * Given a directory that is being unexported, cleanup/release all
3896 * state in the server that refers to objects residing underneath this
3897 * particular export. The ordering of the release is important.
3898 * Lock_owner, then state and then file.
3900 void
3901 rfs4_clean_state_exi(struct exportinfo *exi)
3903 mutex_enter(&rfs4_state_lock);
3905 if (rfs4_server_state == NULL) {
3906 mutex_exit(&rfs4_state_lock);
3907 return;
3910 rfs4_dbe_walk(rfs4_lo_state_tab, rfs4_lo_state_walk_callout, exi);
3911 rfs4_dbe_walk(rfs4_state_tab, rfs4_state_walk_callout, exi);
3912 rfs4_dbe_walk(rfs4_deleg_state_tab, rfs4_deleg_state_walk_callout, exi);
3913 rfs4_dbe_walk(rfs4_file_tab, rfs4_file_walk_callout, exi);
3915 mutex_exit(&rfs4_state_lock);