2986 nfs: exi refcounter leak at rfs3_lookup
[unleashed.git] / usr / src / uts / common / fs / nfs / nfs_srv.c
blobf0cd9633aa157f9cd61889af4ac761d8e658b88f
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
27 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
28 * All rights reserved.
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/systm.h>
34 #include <sys/cred.h>
35 #include <sys/buf.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/uio.h>
39 #include <sys/stat.h>
40 #include <sys/errno.h>
41 #include <sys/sysmacros.h>
42 #include <sys/statvfs.h>
43 #include <sys/kmem.h>
44 #include <sys/kstat.h>
45 #include <sys/dirent.h>
46 #include <sys/cmn_err.h>
47 #include <sys/debug.h>
48 #include <sys/vtrace.h>
49 #include <sys/mode.h>
50 #include <sys/acl.h>
51 #include <sys/nbmlock.h>
52 #include <sys/policy.h>
53 #include <sys/sdt.h>
55 #include <rpc/types.h>
56 #include <rpc/auth.h>
57 #include <rpc/svc.h>
59 #include <nfs/nfs.h>
60 #include <nfs/export.h>
61 #include <nfs/nfs_cmd.h>
63 #include <vm/hat.h>
64 #include <vm/as.h>
65 #include <vm/seg.h>
66 #include <vm/seg_map.h>
67 #include <vm/seg_kmem.h>
69 #include <sys/strsubr.h>
72 * These are the interface routines for the server side of the
73 * Network File System. See the NFS version 2 protocol specification
74 * for a description of this interface.
77 static int sattr_to_vattr(struct nfssattr *, struct vattr *);
78 static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
79 cred_t *);
82 * Some "over the wire" UNIX file types. These are encoded
83 * into the mode. This needs to be fixed in the next rev.
85 #define IFMT 0170000 /* type of file */
86 #define IFCHR 0020000 /* character special */
87 #define IFBLK 0060000 /* block special */
88 #define IFSOCK 0140000 /* socket */
90 u_longlong_t nfs2_srv_caller_id;
93 * Get file attributes.
94 * Returns the current attributes of the file with the given fhandle.
96 /* ARGSUSED */
97 void
98 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
99 struct svc_req *req, cred_t *cr)
101 int error;
102 vnode_t *vp;
103 struct vattr va;
105 vp = nfs_fhtovp(fhp, exi);
106 if (vp == NULL) {
107 ns->ns_status = NFSERR_STALE;
108 return;
112 * Do the getattr.
114 va.va_mask = AT_ALL; /* we want all the attributes */
116 error = rfs4_delegated_getattr(vp, &va, 0, cr);
118 /* check for overflows */
119 if (!error) {
120 /* Lie about the object type for a referral */
121 if (vn_is_nfs_reparse(vp, cr))
122 va.va_type = VLNK;
124 acl_perm(vp, exi, &va, cr);
125 error = vattr_to_nattr(&va, &ns->ns_attr);
128 VN_RELE(vp);
130 ns->ns_status = puterrno(error);
132 void *
133 rfs_getattr_getfh(fhandle_t *fhp)
135 return (fhp);
139 * Set file attributes.
140 * Sets the attributes of the file with the given fhandle. Returns
141 * the new attributes.
143 void
144 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
145 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
147 int error;
148 int flag;
149 int in_crit = 0;
150 vnode_t *vp;
151 struct vattr va;
152 struct vattr bva;
153 struct flock64 bf;
154 caller_context_t ct;
157 vp = nfs_fhtovp(&args->saa_fh, exi);
158 if (vp == NULL) {
159 ns->ns_status = NFSERR_STALE;
160 return;
163 if (rdonly(exi, req) || vn_is_readonly(vp)) {
164 VN_RELE(vp);
165 ns->ns_status = NFSERR_ROFS;
166 return;
169 error = sattr_to_vattr(&args->saa_sa, &va);
170 if (error) {
171 VN_RELE(vp);
172 ns->ns_status = puterrno(error);
173 return;
177 * If the client is requesting a change to the mtime,
178 * but the nanosecond field is set to 1 billion, then
179 * this is a flag to the server that it should set the
180 * atime and mtime fields to the server's current time.
181 * The 1 billion number actually came from the client
182 * as 1 million, but the units in the over the wire
183 * request are microseconds instead of nanoseconds.
185 * This is an overload of the protocol and should be
186 * documented in the NFS Version 2 protocol specification.
188 if (va.va_mask & AT_MTIME) {
189 if (va.va_mtime.tv_nsec == 1000000000) {
190 gethrestime(&va.va_mtime);
191 va.va_atime = va.va_mtime;
192 va.va_mask |= AT_ATIME;
193 flag = 0;
194 } else
195 flag = ATTR_UTIME;
196 } else
197 flag = 0;
200 * If the filesystem is exported with nosuid, then mask off
201 * the setuid and setgid bits.
203 if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
204 (exi->exi_export.ex_flags & EX_NOSUID))
205 va.va_mode &= ~(VSUID | VSGID);
207 ct.cc_sysid = 0;
208 ct.cc_pid = 0;
209 ct.cc_caller_id = nfs2_srv_caller_id;
210 ct.cc_flags = CC_DONTBLOCK;
213 * We need to specially handle size changes because it is
214 * possible for the client to create a file with modes
215 * which indicate read-only, but with the file opened for
216 * writing. If the client then tries to set the size of
217 * the file, then the normal access checking done in
218 * VOP_SETATTR would prevent the client from doing so,
219 * although it should be legal for it to do so. To get
220 * around this, we do the access checking for ourselves
221 * and then use VOP_SPACE which doesn't do the access
222 * checking which VOP_SETATTR does. VOP_SPACE can only
223 * operate on VREG files, let VOP_SETATTR handle the other
224 * extremely rare cases.
225 * Also the client should not be allowed to change the
226 * size of the file if there is a conflicting non-blocking
227 * mandatory lock in the region of change.
229 if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
230 if (nbl_need_check(vp)) {
231 nbl_start_crit(vp, RW_READER);
232 in_crit = 1;
235 bva.va_mask = AT_UID | AT_SIZE;
237 error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
239 if (error) {
240 if (in_crit)
241 nbl_end_crit(vp);
242 VN_RELE(vp);
243 ns->ns_status = puterrno(error);
244 return;
247 if (in_crit) {
248 u_offset_t offset;
249 ssize_t length;
251 if (va.va_size < bva.va_size) {
252 offset = va.va_size;
253 length = bva.va_size - va.va_size;
254 } else {
255 offset = bva.va_size;
256 length = va.va_size - bva.va_size;
258 if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
259 NULL)) {
260 error = EACCES;
264 if (crgetuid(cr) == bva.va_uid && !error &&
265 va.va_size != bva.va_size) {
266 va.va_mask &= ~AT_SIZE;
267 bf.l_type = F_WRLCK;
268 bf.l_whence = 0;
269 bf.l_start = (off64_t)va.va_size;
270 bf.l_len = 0;
271 bf.l_sysid = 0;
272 bf.l_pid = 0;
274 error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
275 (offset_t)va.va_size, cr, &ct);
277 if (in_crit)
278 nbl_end_crit(vp);
279 } else
280 error = 0;
283 * Do the setattr.
285 if (!error && va.va_mask) {
286 error = VOP_SETATTR(vp, &va, flag, cr, &ct);
290 * check if the monitor on either vop_space or vop_setattr detected
291 * a delegation conflict and if so, mark the thread flag as
292 * wouldblock so that the response is dropped and the client will
293 * try again.
295 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
296 VN_RELE(vp);
297 curthread->t_flag |= T_WOULDBLOCK;
298 return;
301 if (!error) {
302 va.va_mask = AT_ALL; /* get everything */
304 error = rfs4_delegated_getattr(vp, &va, 0, cr);
306 /* check for overflows */
307 if (!error) {
308 acl_perm(vp, exi, &va, cr);
309 error = vattr_to_nattr(&va, &ns->ns_attr);
313 ct.cc_flags = 0;
316 * Force modified metadata out to stable storage.
318 (void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
320 VN_RELE(vp);
322 ns->ns_status = puterrno(error);
324 void *
325 rfs_setattr_getfh(struct nfssaargs *args)
327 return (&args->saa_fh);
331 * Directory lookup.
332 * Returns an fhandle and file attributes for file name in a directory.
334 /* ARGSUSED */
335 void
336 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
337 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
339 int error;
340 vnode_t *dvp;
341 vnode_t *vp;
342 struct vattr va;
343 fhandle_t *fhp = da->da_fhandle;
344 struct sec_ol sec = {0, 0};
345 bool_t publicfh_flag = FALSE, auth_weak = FALSE;
346 char *name;
347 struct sockaddr *ca;
350 * Trusted Extension doesn't support NFSv2. MOUNT
351 * will reject v2 clients. Need to prevent v2 client
352 * access via WebNFS here.
354 if (is_system_labeled() && req->rq_vers == 2) {
355 dr->dr_status = NFSERR_ACCES;
356 return;
360 * Disallow NULL paths
362 if (da->da_name == NULL || *da->da_name == '\0') {
363 dr->dr_status = NFSERR_ACCES;
364 return;
368 * Allow lookups from the root - the default
369 * location of the public filehandle.
371 if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
372 dvp = rootdir;
373 VN_HOLD(dvp);
374 } else {
375 dvp = nfs_fhtovp(fhp, exi);
376 if (dvp == NULL) {
377 dr->dr_status = NFSERR_STALE;
378 return;
383 * Not allow lookup beyond root.
384 * If the filehandle matches a filehandle of the exi,
385 * then the ".." refers beyond the root of an exported filesystem.
387 if (strcmp(da->da_name, "..") == 0 &&
388 EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
389 VN_RELE(dvp);
390 dr->dr_status = NFSERR_NOENT;
391 return;
394 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
395 name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
396 MAXPATHLEN);
398 if (name == NULL) {
399 dr->dr_status = NFSERR_ACCES;
400 return;
403 exi_hold(exi);
406 * If the public filehandle is used then allow
407 * a multi-component lookup, i.e. evaluate
408 * a pathname and follow symbolic links if
409 * necessary.
411 * This may result in a vnode in another filesystem
412 * which is OK as long as the filesystem is exported.
414 if (PUBLIC_FH2(fhp)) {
415 struct exportinfo *new;
417 publicfh_flag = TRUE;
418 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &new,
419 &sec);
421 if (error == 0) {
422 exi_rele(exi);
423 exi = new;
425 } else {
427 * Do a normal single component lookup.
429 error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
430 NULL, NULL, NULL);
433 if (name != da->da_name)
434 kmem_free(name, MAXPATHLEN);
437 if (!error) {
438 va.va_mask = AT_ALL; /* we want everything */
440 error = rfs4_delegated_getattr(vp, &va, 0, cr);
442 /* check for overflows */
443 if (!error) {
444 acl_perm(vp, exi, &va, cr);
445 error = vattr_to_nattr(&va, &dr->dr_attr);
446 if (!error) {
447 if (sec.sec_flags & SEC_QUERY)
448 error = makefh_ol(&dr->dr_fhandle, exi,
449 sec.sec_index);
450 else {
451 error = makefh(&dr->dr_fhandle, vp,
452 exi);
453 if (!error && publicfh_flag &&
454 !chk_clnt_sec(exi, req))
455 auth_weak = TRUE;
459 VN_RELE(vp);
462 VN_RELE(dvp);
465 * The passed argument exportinfo is released by the
466 * caller, comon_dispatch
468 exi_rele(exi);
471 * If it's public fh, no 0x81, and client's flavor is
472 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
473 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
475 if (auth_weak)
476 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
477 else
478 dr->dr_status = puterrno(error);
480 void *
481 rfs_lookup_getfh(struct nfsdiropargs *da)
483 return (da->da_fhandle);
487 * Read symbolic link.
488 * Returns the string in the symbolic link at the given fhandle.
490 /* ARGSUSED */
491 void
492 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
493 struct svc_req *req, cred_t *cr)
495 int error;
496 struct iovec iov;
497 struct uio uio;
498 vnode_t *vp;
499 struct vattr va;
500 struct sockaddr *ca;
501 char *name = NULL;
502 int is_referral = 0;
504 vp = nfs_fhtovp(fhp, exi);
505 if (vp == NULL) {
506 rl->rl_data = NULL;
507 rl->rl_status = NFSERR_STALE;
508 return;
511 va.va_mask = AT_MODE;
513 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
515 if (error) {
516 VN_RELE(vp);
517 rl->rl_data = NULL;
518 rl->rl_status = puterrno(error);
519 return;
522 if (MANDLOCK(vp, va.va_mode)) {
523 VN_RELE(vp);
524 rl->rl_data = NULL;
525 rl->rl_status = NFSERR_ACCES;
526 return;
529 /* We lied about the object type for a referral */
530 if (vn_is_nfs_reparse(vp, cr))
531 is_referral = 1;
534 * XNFS and RFC1094 require us to return ENXIO if argument
535 * is not a link. BUGID 1138002.
537 if (vp->v_type != VLNK && !is_referral) {
538 VN_RELE(vp);
539 rl->rl_data = NULL;
540 rl->rl_status = NFSERR_NXIO;
541 return;
545 * Allocate data for pathname. This will be freed by rfs_rlfree.
547 rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
549 if (is_referral) {
550 char *s;
551 size_t strsz;
553 /* Get an artificial symlink based on a referral */
554 s = build_symlink(vp, cr, &strsz);
555 global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
556 DTRACE_PROBE2(nfs2serv__func__referral__reflink,
557 vnode_t *, vp, char *, s);
558 if (s == NULL)
559 error = EINVAL;
560 else {
561 error = 0;
562 (void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
563 rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
564 kmem_free(s, strsz);
567 } else {
570 * Set up io vector to read sym link data
572 iov.iov_base = rl->rl_data;
573 iov.iov_len = NFS_MAXPATHLEN;
574 uio.uio_iov = &iov;
575 uio.uio_iovcnt = 1;
576 uio.uio_segflg = UIO_SYSSPACE;
577 uio.uio_extflg = UIO_COPY_CACHED;
578 uio.uio_loffset = (offset_t)0;
579 uio.uio_resid = NFS_MAXPATHLEN;
582 * Do the readlink.
584 error = VOP_READLINK(vp, &uio, cr, NULL);
586 rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
588 if (!error)
589 rl->rl_data[rl->rl_count] = '\0';
594 VN_RELE(vp);
596 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
597 name = nfscmd_convname(ca, exi, rl->rl_data,
598 NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
600 if (name != NULL && name != rl->rl_data) {
601 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
602 rl->rl_data = name;
606 * XNFS and RFC1094 require us to return ENXIO if argument
607 * is not a link. UFS returns EINVAL if this is the case,
608 * so we do the mapping here. BUGID 1138002.
610 if (error == EINVAL)
611 rl->rl_status = NFSERR_NXIO;
612 else
613 rl->rl_status = puterrno(error);
616 void *
617 rfs_readlink_getfh(fhandle_t *fhp)
619 return (fhp);
622 * Free data allocated by rfs_readlink
624 void
625 rfs_rlfree(struct nfsrdlnres *rl)
627 if (rl->rl_data != NULL)
628 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
631 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
634 * Read data.
635 * Returns some data read from the file at the given fhandle.
637 /* ARGSUSED */
638 void
639 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
640 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
642 vnode_t *vp;
643 int error;
644 struct vattr va;
645 struct iovec iov;
646 struct uio uio;
647 mblk_t *mp;
648 int alloc_err = 0;
649 int in_crit = 0;
650 caller_context_t ct;
652 vp = nfs_fhtovp(&ra->ra_fhandle, exi);
653 if (vp == NULL) {
654 rr->rr_data = NULL;
655 rr->rr_status = NFSERR_STALE;
656 return;
659 if (vp->v_type != VREG) {
660 VN_RELE(vp);
661 rr->rr_data = NULL;
662 rr->rr_status = NFSERR_ISDIR;
663 return;
666 ct.cc_sysid = 0;
667 ct.cc_pid = 0;
668 ct.cc_caller_id = nfs2_srv_caller_id;
669 ct.cc_flags = CC_DONTBLOCK;
672 * Enter the critical region before calling VOP_RWLOCK
673 * to avoid a deadlock with write requests.
675 if (nbl_need_check(vp)) {
676 nbl_start_crit(vp, RW_READER);
677 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
678 0, NULL)) {
679 nbl_end_crit(vp);
680 VN_RELE(vp);
681 rr->rr_data = NULL;
682 rr->rr_status = NFSERR_ACCES;
683 return;
685 in_crit = 1;
688 error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
690 /* check if a monitor detected a delegation conflict */
691 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
692 VN_RELE(vp);
693 /* mark as wouldblock so response is dropped */
694 curthread->t_flag |= T_WOULDBLOCK;
696 rr->rr_data = NULL;
697 return;
700 va.va_mask = AT_ALL;
702 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
704 if (error) {
705 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
706 if (in_crit)
707 nbl_end_crit(vp);
709 VN_RELE(vp);
710 rr->rr_data = NULL;
711 rr->rr_status = puterrno(error);
713 return;
717 * This is a kludge to allow reading of files created
718 * with no read permission. The owner of the file
719 * is always allowed to read it.
721 if (crgetuid(cr) != va.va_uid) {
722 error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
724 if (error) {
726 * Exec is the same as read over the net because
727 * of demand loading.
729 error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
731 if (error) {
732 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
733 if (in_crit)
734 nbl_end_crit(vp);
735 VN_RELE(vp);
736 rr->rr_data = NULL;
737 rr->rr_status = puterrno(error);
739 return;
743 if (MANDLOCK(vp, va.va_mode)) {
744 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
745 if (in_crit)
746 nbl_end_crit(vp);
748 VN_RELE(vp);
749 rr->rr_data = NULL;
750 rr->rr_status = NFSERR_ACCES;
752 return;
755 rr->rr_ok.rrok_wlist_len = 0;
756 rr->rr_ok.rrok_wlist = NULL;
758 if ((u_offset_t)ra->ra_offset >= va.va_size) {
759 rr->rr_count = 0;
760 rr->rr_data = NULL;
762 * In this case, status is NFS_OK, but there is no data
763 * to encode. So set rr_mp to NULL.
765 rr->rr_mp = NULL;
766 rr->rr_ok.rrok_wlist = ra->ra_wlist;
767 if (rr->rr_ok.rrok_wlist)
768 clist_zero_len(rr->rr_ok.rrok_wlist);
769 goto done;
772 if (ra->ra_wlist) {
773 mp = NULL;
774 rr->rr_mp = NULL;
775 (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
776 if (ra->ra_count > iov.iov_len) {
777 rr->rr_data = NULL;
778 rr->rr_status = NFSERR_INVAL;
779 goto done;
781 } else {
783 * mp will contain the data to be sent out in the read reply.
784 * This will be freed after the reply has been sent out (by the
785 * driver).
786 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
787 * that the call to xdrmblk_putmblk() never fails.
789 mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
790 &alloc_err);
791 ASSERT(mp != NULL);
792 ASSERT(alloc_err == 0);
794 rr->rr_mp = mp;
797 * Set up io vector
799 iov.iov_base = (caddr_t)mp->b_datap->db_base;
800 iov.iov_len = ra->ra_count;
803 uio.uio_iov = &iov;
804 uio.uio_iovcnt = 1;
805 uio.uio_segflg = UIO_SYSSPACE;
806 uio.uio_extflg = UIO_COPY_CACHED;
807 uio.uio_loffset = (offset_t)ra->ra_offset;
808 uio.uio_resid = ra->ra_count;
810 error = VOP_READ(vp, &uio, 0, cr, &ct);
812 if (error) {
813 if (mp)
814 freeb(mp);
817 * check if a monitor detected a delegation conflict and
818 * mark as wouldblock so response is dropped
820 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
821 curthread->t_flag |= T_WOULDBLOCK;
822 else
823 rr->rr_status = puterrno(error);
825 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
826 if (in_crit)
827 nbl_end_crit(vp);
829 VN_RELE(vp);
830 rr->rr_data = NULL;
832 return;
836 * Get attributes again so we can send the latest access
837 * time to the client side for his cache.
839 va.va_mask = AT_ALL;
841 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
843 if (error) {
844 if (mp)
845 freeb(mp);
847 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
848 if (in_crit)
849 nbl_end_crit(vp);
851 VN_RELE(vp);
852 rr->rr_data = NULL;
853 rr->rr_status = puterrno(error);
855 return;
858 rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
860 if (mp) {
861 rr->rr_data = (char *)mp->b_datap->db_base;
862 } else {
863 if (ra->ra_wlist) {
864 rr->rr_data = (caddr_t)iov.iov_base;
865 if (!rdma_setup_read_data2(ra, rr)) {
866 rr->rr_data = NULL;
867 rr->rr_status = puterrno(NFSERR_INVAL);
871 done:
872 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
873 if (in_crit)
874 nbl_end_crit(vp);
876 acl_perm(vp, exi, &va, cr);
878 /* check for overflows */
879 error = vattr_to_nattr(&va, &rr->rr_attr);
881 VN_RELE(vp);
883 rr->rr_status = puterrno(error);
887 * Free data allocated by rfs_read
889 void
890 rfs_rdfree(struct nfsrdresult *rr)
892 mblk_t *mp;
894 if (rr->rr_status == NFS_OK) {
895 mp = rr->rr_mp;
896 if (mp != NULL)
897 freeb(mp);
901 void *
902 rfs_read_getfh(struct nfsreadargs *ra)
904 return (&ra->ra_fhandle);
907 #define MAX_IOVECS 12
909 #ifdef DEBUG
910 static int rfs_write_sync_hits = 0;
911 static int rfs_write_sync_misses = 0;
912 #endif
915 * Write data to file.
916 * Returns attributes of a file after writing some data to it.
918 * Any changes made here, especially in error handling might have
919 * to also be done in rfs_write (which clusters write requests).
921 void
922 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
923 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
925 int error;
926 vnode_t *vp;
927 rlim64_t rlimit;
928 struct vattr va;
929 struct uio uio;
930 struct iovec iov[MAX_IOVECS];
931 mblk_t *m;
932 struct iovec *iovp;
933 int iovcnt;
934 cred_t *savecred;
935 int in_crit = 0;
936 caller_context_t ct;
938 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
939 if (vp == NULL) {
940 ns->ns_status = NFSERR_STALE;
941 return;
944 if (rdonly(exi, req)) {
945 VN_RELE(vp);
946 ns->ns_status = NFSERR_ROFS;
947 return;
950 if (vp->v_type != VREG) {
951 VN_RELE(vp);
952 ns->ns_status = NFSERR_ISDIR;
953 return;
956 ct.cc_sysid = 0;
957 ct.cc_pid = 0;
958 ct.cc_caller_id = nfs2_srv_caller_id;
959 ct.cc_flags = CC_DONTBLOCK;
961 va.va_mask = AT_UID|AT_MODE;
963 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
965 if (error) {
966 VN_RELE(vp);
967 ns->ns_status = puterrno(error);
969 return;
972 if (crgetuid(cr) != va.va_uid) {
974 * This is a kludge to allow writes of files created
975 * with read only permission. The owner of the file
976 * is always allowed to write it.
978 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
980 if (error) {
981 VN_RELE(vp);
982 ns->ns_status = puterrno(error);
983 return;
988 * Can't access a mandatory lock file. This might cause
989 * the NFS service thread to block forever waiting for a
990 * lock to be released that will never be released.
992 if (MANDLOCK(vp, va.va_mode)) {
993 VN_RELE(vp);
994 ns->ns_status = NFSERR_ACCES;
995 return;
999 * We have to enter the critical region before calling VOP_RWLOCK
1000 * to avoid a deadlock with ufs.
1002 if (nbl_need_check(vp)) {
1003 nbl_start_crit(vp, RW_READER);
1004 in_crit = 1;
1005 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1006 wa->wa_count, 0, NULL)) {
1007 error = EACCES;
1008 goto out;
1012 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1014 /* check if a monitor detected a delegation conflict */
1015 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1016 VN_RELE(vp);
1017 /* mark as wouldblock so response is dropped */
1018 curthread->t_flag |= T_WOULDBLOCK;
1019 return;
1022 if (wa->wa_data || wa->wa_rlist) {
1023 /* Do the RDMA thing if necessary */
1024 if (wa->wa_rlist) {
1025 iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1026 iov[0].iov_len = wa->wa_count;
1027 } else {
1028 iov[0].iov_base = wa->wa_data;
1029 iov[0].iov_len = wa->wa_count;
1031 uio.uio_iov = iov;
1032 uio.uio_iovcnt = 1;
1033 uio.uio_segflg = UIO_SYSSPACE;
1034 uio.uio_extflg = UIO_COPY_DEFAULT;
1035 uio.uio_loffset = (offset_t)wa->wa_offset;
1036 uio.uio_resid = wa->wa_count;
1038 * The limit is checked on the client. We
1039 * should allow any size writes here.
1041 uio.uio_llimit = curproc->p_fsz_ctl;
1042 rlimit = uio.uio_llimit - wa->wa_offset;
1043 if (rlimit < (rlim64_t)uio.uio_resid)
1044 uio.uio_resid = (uint_t)rlimit;
1047 * for now we assume no append mode
1050 * We're changing creds because VM may fault and we need
1051 * the cred of the current thread to be used if quota
1052 * checking is enabled.
1054 savecred = curthread->t_cred;
1055 curthread->t_cred = cr;
1056 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1057 curthread->t_cred = savecred;
1058 } else {
1059 iovcnt = 0;
1060 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1061 iovcnt++;
1062 if (iovcnt <= MAX_IOVECS) {
1063 #ifdef DEBUG
1064 rfs_write_sync_hits++;
1065 #endif
1066 iovp = iov;
1067 } else {
1068 #ifdef DEBUG
1069 rfs_write_sync_misses++;
1070 #endif
1071 iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1073 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1074 uio.uio_iov = iovp;
1075 uio.uio_iovcnt = iovcnt;
1076 uio.uio_segflg = UIO_SYSSPACE;
1077 uio.uio_extflg = UIO_COPY_DEFAULT;
1078 uio.uio_loffset = (offset_t)wa->wa_offset;
1079 uio.uio_resid = wa->wa_count;
1081 * The limit is checked on the client. We
1082 * should allow any size writes here.
1084 uio.uio_llimit = curproc->p_fsz_ctl;
1085 rlimit = uio.uio_llimit - wa->wa_offset;
1086 if (rlimit < (rlim64_t)uio.uio_resid)
1087 uio.uio_resid = (uint_t)rlimit;
1090 * For now we assume no append mode.
1093 * We're changing creds because VM may fault and we need
1094 * the cred of the current thread to be used if quota
1095 * checking is enabled.
1097 savecred = curthread->t_cred;
1098 curthread->t_cred = cr;
1099 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1100 curthread->t_cred = savecred;
1102 if (iovp != iov)
1103 kmem_free(iovp, sizeof (*iovp) * iovcnt);
1106 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1108 if (!error) {
1110 * Get attributes again so we send the latest mod
1111 * time to the client side for his cache.
1113 va.va_mask = AT_ALL; /* now we want everything */
1115 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1117 /* check for overflows */
1118 if (!error) {
1119 acl_perm(vp, exi, &va, cr);
1120 error = vattr_to_nattr(&va, &ns->ns_attr);
1124 out:
1125 if (in_crit)
1126 nbl_end_crit(vp);
1127 VN_RELE(vp);
1129 /* check if a monitor detected a delegation conflict */
1130 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1131 /* mark as wouldblock so response is dropped */
1132 curthread->t_flag |= T_WOULDBLOCK;
1133 else
1134 ns->ns_status = puterrno(error);
/*
 * One queued WRITE request in a write cluster.  Requests to the same
 * file handle are linked together (via 'list') and serviced by a single
 * thread; the others sleep on the cluster's condition variable.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* the client's WRITE arguments */
	struct nfsattrstat *ns;		/* where this request's reply goes */
	struct svc_req *req;		/* RPC request (per-request rdonly check) */
	cred_t *cr;			/* credentials of this request */
	kthread_t *thread;		/* requesting thread, for T_WOULDBLOCK */
	struct rfs_async_write *list;	/* next request in this cluster */
};

/*
 * A cluster of WRITE requests to one file, keyed by file handle.
 * Clusters are kept on a global singly-linked list headed by
 * rfs_async_write_head and protected by rfs_async_write_lock.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle shared by the cluster */
	kcondvar_t cv;			/* waiters signalled when cluster done */
	struct rfs_async_write *list;	/* requests, ordered by wa_offset */
	struct rfs_async_write_list *next;	/* next cluster */
};

static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

#define	MAXCLIOVECS	42
/* sentinel meaning "reply not yet filled in" (0 would read as NFS_OK) */
#define	RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1167 * Write data to file.
1168 * Returns attributes of a file after writing some data to it.
1170 void
1171 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1172 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1174 int error;
1175 vnode_t *vp;
1176 rlim64_t rlimit;
1177 struct vattr va;
1178 struct uio uio;
1179 struct rfs_async_write_list *lp;
1180 struct rfs_async_write_list *nlp;
1181 struct rfs_async_write *rp;
1182 struct rfs_async_write *nrp;
1183 struct rfs_async_write *trp;
1184 struct rfs_async_write *lrp;
1185 int data_written;
1186 int iovcnt;
1187 mblk_t *m;
1188 struct iovec *iovp;
1189 struct iovec *niovp;
1190 struct iovec iov[MAXCLIOVECS];
1191 int count;
1192 int rcount;
1193 uint_t off;
1194 uint_t len;
1195 struct rfs_async_write nrpsp;
1196 struct rfs_async_write_list nlpsp;
1197 ushort_t t_flag;
1198 cred_t *savecred;
1199 int in_crit = 0;
1200 caller_context_t ct;
1202 if (!rfs_write_async) {
1203 rfs_write_sync(wa, ns, exi, req, cr);
1204 return;
1208 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1209 * is considered an OK.
1211 ns->ns_status = RFSWRITE_INITVAL;
1213 nrp = &nrpsp;
1214 nrp->wa = wa;
1215 nrp->ns = ns;
1216 nrp->req = req;
1217 nrp->cr = cr;
1218 nrp->thread = curthread;
1220 ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1223 * Look to see if there is already a cluster started
1224 * for this file.
1226 mutex_enter(&rfs_async_write_lock);
1227 for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1228 if (bcmp(&wa->wa_fhandle, lp->fhp,
1229 sizeof (fhandle_t)) == 0)
1230 break;
1234 * If lp is non-NULL, then there is already a cluster
1235 * started. We need to place ourselves in the cluster
1236 * list in the right place as determined by starting
1237 * offset. Conflicts with non-blocking mandatory locked
1238 * regions will be checked when the cluster is processed.
1240 if (lp != NULL) {
1241 rp = lp->list;
1242 trp = NULL;
1243 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1244 trp = rp;
1245 rp = rp->list;
1247 nrp->list = rp;
1248 if (trp == NULL)
1249 lp->list = nrp;
1250 else
1251 trp->list = nrp;
1252 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1253 cv_wait(&lp->cv, &rfs_async_write_lock);
1254 mutex_exit(&rfs_async_write_lock);
1256 return;
1260 * No cluster started yet, start one and add ourselves
1261 * to the list of clusters.
1263 nrp->list = NULL;
1265 nlp = &nlpsp;
1266 nlp->fhp = &wa->wa_fhandle;
1267 cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1268 nlp->list = nrp;
1269 nlp->next = NULL;
1271 if (rfs_async_write_head == NULL) {
1272 rfs_async_write_head = nlp;
1273 } else {
1274 lp = rfs_async_write_head;
1275 while (lp->next != NULL)
1276 lp = lp->next;
1277 lp->next = nlp;
1279 mutex_exit(&rfs_async_write_lock);
1282 * Convert the file handle common to all of the requests
1283 * in this cluster to a vnode.
1285 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1286 if (vp == NULL) {
1287 mutex_enter(&rfs_async_write_lock);
1288 if (rfs_async_write_head == nlp)
1289 rfs_async_write_head = nlp->next;
1290 else {
1291 lp = rfs_async_write_head;
1292 while (lp->next != nlp)
1293 lp = lp->next;
1294 lp->next = nlp->next;
1296 t_flag = curthread->t_flag & T_WOULDBLOCK;
1297 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1298 rp->ns->ns_status = NFSERR_STALE;
1299 rp->thread->t_flag |= t_flag;
1301 cv_broadcast(&nlp->cv);
1302 mutex_exit(&rfs_async_write_lock);
1304 return;
1308 * Can only write regular files. Attempts to write any
1309 * other file types fail with EISDIR.
1311 if (vp->v_type != VREG) {
1312 VN_RELE(vp);
1313 mutex_enter(&rfs_async_write_lock);
1314 if (rfs_async_write_head == nlp)
1315 rfs_async_write_head = nlp->next;
1316 else {
1317 lp = rfs_async_write_head;
1318 while (lp->next != nlp)
1319 lp = lp->next;
1320 lp->next = nlp->next;
1322 t_flag = curthread->t_flag & T_WOULDBLOCK;
1323 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1324 rp->ns->ns_status = NFSERR_ISDIR;
1325 rp->thread->t_flag |= t_flag;
1327 cv_broadcast(&nlp->cv);
1328 mutex_exit(&rfs_async_write_lock);
1330 return;
1334 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1335 * deadlock with ufs.
1337 if (nbl_need_check(vp)) {
1338 nbl_start_crit(vp, RW_READER);
1339 in_crit = 1;
1342 ct.cc_sysid = 0;
1343 ct.cc_pid = 0;
1344 ct.cc_caller_id = nfs2_srv_caller_id;
1345 ct.cc_flags = CC_DONTBLOCK;
1348 * Lock the file for writing. This operation provides
1349 * the delay which allows clusters to grow.
1351 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1353 /* check if a monitor detected a delegation conflict */
1354 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1355 if (in_crit)
1356 nbl_end_crit(vp);
1357 VN_RELE(vp);
1358 /* mark as wouldblock so response is dropped */
1359 curthread->t_flag |= T_WOULDBLOCK;
1360 mutex_enter(&rfs_async_write_lock);
1361 if (rfs_async_write_head == nlp)
1362 rfs_async_write_head = nlp->next;
1363 else {
1364 lp = rfs_async_write_head;
1365 while (lp->next != nlp)
1366 lp = lp->next;
1367 lp->next = nlp->next;
1369 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1370 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1371 rp->ns->ns_status = puterrno(error);
1372 rp->thread->t_flag |= T_WOULDBLOCK;
1375 cv_broadcast(&nlp->cv);
1376 mutex_exit(&rfs_async_write_lock);
1378 return;
1382 * Disconnect this cluster from the list of clusters.
1383 * The cluster that is being dealt with must be fixed
1384 * in size after this point, so there is no reason
1385 * to leave it on the list so that new requests can
1386 * find it.
1388 * The algorithm is that the first write request will
1389 * create a cluster, convert the file handle to a
1390 * vnode pointer, and then lock the file for writing.
1391 * This request is not likely to be clustered with
1392 * any others. However, the next request will create
1393 * a new cluster and be blocked in VOP_RWLOCK while
1394 * the first request is being processed. This delay
1395 * will allow more requests to be clustered in this
1396 * second cluster.
1398 mutex_enter(&rfs_async_write_lock);
1399 if (rfs_async_write_head == nlp)
1400 rfs_async_write_head = nlp->next;
1401 else {
1402 lp = rfs_async_write_head;
1403 while (lp->next != nlp)
1404 lp = lp->next;
1405 lp->next = nlp->next;
1407 mutex_exit(&rfs_async_write_lock);
1410 * Step through the list of requests in this cluster.
1411 * We need to check permissions to make sure that all
1412 * of the requests have sufficient permission to write
1413 * the file. A cluster can be composed of requests
1414 * from different clients and different users on each
1415 * client.
1417 * As a side effect, we also calculate the size of the
1418 * byte range that this cluster encompasses.
1420 rp = nlp->list;
1421 off = rp->wa->wa_offset;
1422 len = (uint_t)0;
1423 do {
1424 if (rdonly(exi, rp->req)) {
1425 rp->ns->ns_status = NFSERR_ROFS;
1426 t_flag = curthread->t_flag & T_WOULDBLOCK;
1427 rp->thread->t_flag |= t_flag;
1428 continue;
1431 va.va_mask = AT_UID|AT_MODE;
1433 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1435 if (!error) {
1436 if (crgetuid(rp->cr) != va.va_uid) {
1438 * This is a kludge to allow writes of files
1439 * created with read only permission. The
1440 * owner of the file is always allowed to
1441 * write it.
1443 error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1445 if (!error && MANDLOCK(vp, va.va_mode))
1446 error = EACCES;
1450 * Check for a conflict with a nbmand-locked region.
1452 if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1453 rp->wa->wa_count, 0, NULL)) {
1454 error = EACCES;
1457 if (error) {
1458 rp->ns->ns_status = puterrno(error);
1459 t_flag = curthread->t_flag & T_WOULDBLOCK;
1460 rp->thread->t_flag |= t_flag;
1461 continue;
1463 if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1464 len = rp->wa->wa_offset + rp->wa->wa_count - off;
1465 } while ((rp = rp->list) != NULL);
1468 * Step through the cluster attempting to gather as many
1469 * requests which are contiguous as possible. These
1470 * contiguous requests are handled via one call to VOP_WRITE
1471 * instead of different calls to VOP_WRITE. We also keep
1472 * track of the fact that any data was written.
1474 rp = nlp->list;
1475 data_written = 0;
1476 do {
1478 * Skip any requests which are already marked as having an
1479 * error.
1481 if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1482 rp = rp->list;
1483 continue;
1487 * Count the number of iovec's which are required
1488 * to handle this set of requests. One iovec is
1489 * needed for each data buffer, whether addressed
1490 * by wa_data or by the b_rptr pointers in the
1491 * mblk chains.
1493 iovcnt = 0;
1494 lrp = rp;
1495 for (;;) {
1496 if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1497 iovcnt++;
1498 else {
1499 m = lrp->wa->wa_mblk;
1500 while (m != NULL) {
1501 iovcnt++;
1502 m = m->b_cont;
1505 if (lrp->list == NULL ||
1506 lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1507 lrp->wa->wa_offset + lrp->wa->wa_count !=
1508 lrp->list->wa->wa_offset) {
1509 lrp = lrp->list;
1510 break;
1512 lrp = lrp->list;
1515 if (iovcnt <= MAXCLIOVECS) {
1516 #ifdef DEBUG
1517 rfs_write_hits++;
1518 #endif
1519 niovp = iov;
1520 } else {
1521 #ifdef DEBUG
1522 rfs_write_misses++;
1523 #endif
1524 niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1527 * Put together the scatter/gather iovecs.
1529 iovp = niovp;
1530 trp = rp;
1531 count = 0;
1532 do {
1533 if (trp->wa->wa_data || trp->wa->wa_rlist) {
1534 if (trp->wa->wa_rlist) {
1535 iovp->iov_base =
1536 (char *)((trp->wa->wa_rlist)->
1537 u.c_daddr3);
1538 iovp->iov_len = trp->wa->wa_count;
1539 } else {
1540 iovp->iov_base = trp->wa->wa_data;
1541 iovp->iov_len = trp->wa->wa_count;
1543 iovp++;
1544 } else {
1545 m = trp->wa->wa_mblk;
1546 rcount = trp->wa->wa_count;
1547 while (m != NULL) {
1548 iovp->iov_base = (caddr_t)m->b_rptr;
1549 iovp->iov_len = (m->b_wptr - m->b_rptr);
1550 rcount -= iovp->iov_len;
1551 if (rcount < 0)
1552 iovp->iov_len += rcount;
1553 iovp++;
1554 if (rcount <= 0)
1555 break;
1556 m = m->b_cont;
1559 count += trp->wa->wa_count;
1560 trp = trp->list;
1561 } while (trp != lrp);
1563 uio.uio_iov = niovp;
1564 uio.uio_iovcnt = iovcnt;
1565 uio.uio_segflg = UIO_SYSSPACE;
1566 uio.uio_extflg = UIO_COPY_DEFAULT;
1567 uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1568 uio.uio_resid = count;
1570 * The limit is checked on the client. We
1571 * should allow any size writes here.
1573 uio.uio_llimit = curproc->p_fsz_ctl;
1574 rlimit = uio.uio_llimit - rp->wa->wa_offset;
1575 if (rlimit < (rlim64_t)uio.uio_resid)
1576 uio.uio_resid = (uint_t)rlimit;
1579 * For now we assume no append mode.
1583 * We're changing creds because VM may fault
1584 * and we need the cred of the current
1585 * thread to be used if quota * checking is
1586 * enabled.
1588 savecred = curthread->t_cred;
1589 curthread->t_cred = cr;
1590 error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1591 curthread->t_cred = savecred;
1593 /* check if a monitor detected a delegation conflict */
1594 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1595 /* mark as wouldblock so response is dropped */
1596 curthread->t_flag |= T_WOULDBLOCK;
1598 if (niovp != iov)
1599 kmem_free(niovp, sizeof (*niovp) * iovcnt);
1601 if (!error) {
1602 data_written = 1;
1604 * Get attributes again so we send the latest mod
1605 * time to the client side for his cache.
1607 va.va_mask = AT_ALL; /* now we want everything */
1609 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1611 if (!error)
1612 acl_perm(vp, exi, &va, rp->cr);
1616 * Fill in the status responses for each request
1617 * which was just handled. Also, copy the latest
1618 * attributes in to the attribute responses if
1619 * appropriate.
1621 t_flag = curthread->t_flag & T_WOULDBLOCK;
1622 do {
1623 rp->thread->t_flag |= t_flag;
1624 /* check for overflows */
1625 if (!error) {
1626 error = vattr_to_nattr(&va, &rp->ns->ns_attr);
1628 rp->ns->ns_status = puterrno(error);
1629 rp = rp->list;
1630 } while (rp != lrp);
1631 } while (rp != NULL);
1634 * If any data was written at all, then we need to flush
1635 * the data and metadata to stable storage.
1637 if (data_written) {
1638 error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1640 if (!error) {
1641 error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1645 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1647 if (in_crit)
1648 nbl_end_crit(vp);
1649 VN_RELE(vp);
1651 t_flag = curthread->t_flag & T_WOULDBLOCK;
1652 mutex_enter(&rfs_async_write_lock);
1653 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1654 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1655 rp->ns->ns_status = puterrno(error);
1656 rp->thread->t_flag |= t_flag;
1659 cv_broadcast(&nlp->cv);
1660 mutex_exit(&rfs_async_write_lock);
/*
 * Return a pointer to the file handle embedded in the NFSv2 WRITE
 * arguments; used by the dispatcher to look up the export before
 * the procedure itself runs.
 */
void *
rfs_write_getfh(struct nfswriteargs *wa)
{
	return (&wa->wa_fhandle);
}
1671 * Create a file.
1672 * Creates a file with given attributes and returns those attributes
1673 * and an fhandle for the new file.
1675 void
1676 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1677 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
1679 int error;
1680 int lookuperr;
1681 int in_crit = 0;
1682 struct vattr va;
1683 vnode_t *vp;
1684 vnode_t *realvp;
1685 vnode_t *dvp;
1686 char *name = args->ca_da.da_name;
1687 vnode_t *tvp = NULL;
1688 int mode;
1689 int lookup_ok;
1690 bool_t trunc;
1691 struct sockaddr *ca;
1694 * Disallow NULL paths
1696 if (name == NULL || *name == '\0') {
1697 dr->dr_status = NFSERR_ACCES;
1698 return;
1701 dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1702 if (dvp == NULL) {
1703 dr->dr_status = NFSERR_STALE;
1704 return;
1707 error = sattr_to_vattr(args->ca_sa, &va);
1708 if (error) {
1709 dr->dr_status = puterrno(error);
1710 return;
1714 * Must specify the mode.
1716 if (!(va.va_mask & AT_MODE)) {
1717 VN_RELE(dvp);
1718 dr->dr_status = NFSERR_INVAL;
1719 return;
1723 * This is a completely gross hack to make mknod
1724 * work over the wire until we can wack the protocol
1726 if ((va.va_mode & IFMT) == IFCHR) {
1727 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1728 va.va_type = VFIFO; /* xtra kludge for named pipe */
1729 else {
1730 va.va_type = VCHR;
1732 * uncompress the received dev_t
1733 * if the top half is zero indicating a request
1734 * from an `older style' OS.
1736 if ((va.va_size & 0xffff0000) == 0)
1737 va.va_rdev = nfsv2_expdev(va.va_size);
1738 else
1739 va.va_rdev = (dev_t)va.va_size;
1741 va.va_mask &= ~AT_SIZE;
1742 } else if ((va.va_mode & IFMT) == IFBLK) {
1743 va.va_type = VBLK;
1745 * uncompress the received dev_t
1746 * if the top half is zero indicating a request
1747 * from an `older style' OS.
1749 if ((va.va_size & 0xffff0000) == 0)
1750 va.va_rdev = nfsv2_expdev(va.va_size);
1751 else
1752 va.va_rdev = (dev_t)va.va_size;
1753 va.va_mask &= ~AT_SIZE;
1754 } else if ((va.va_mode & IFMT) == IFSOCK) {
1755 va.va_type = VSOCK;
1756 } else {
1757 va.va_type = VREG;
1759 va.va_mode &= ~IFMT;
1760 va.va_mask |= AT_TYPE;
1762 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1763 name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1764 MAXPATHLEN);
1765 if (name == NULL) {
1766 dr->dr_status = puterrno(EINVAL);
1767 return;
1771 * Why was the choice made to use VWRITE as the mode to the
1772 * call to VOP_CREATE ? This results in a bug. When a client
1773 * opens a file that already exists and is RDONLY, the second
1774 * open fails with an EACESS because of the mode.
1775 * bug ID 1054648.
1777 lookup_ok = 0;
1778 mode = VWRITE;
1779 if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1780 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1781 NULL, NULL, NULL);
1782 if (!error) {
1783 struct vattr at;
1785 lookup_ok = 1;
1786 at.va_mask = AT_MODE;
1787 error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1788 if (!error)
1789 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1790 VN_RELE(tvp);
1791 tvp = NULL;
1795 if (!lookup_ok) {
1796 if (rdonly(exi, req)) {
1797 error = EROFS;
1798 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1799 va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1800 error = EPERM;
1801 } else {
1802 error = 0;
1807 * If file size is being modified on an already existing file
1808 * make sure that there are no conflicting non-blocking mandatory
1809 * locks in the region being manipulated. Return EACCES if there
1810 * are conflicting locks.
1812 if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1813 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1814 NULL, NULL, NULL);
1816 if (!lookuperr &&
1817 rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1818 VN_RELE(tvp);
1819 curthread->t_flag |= T_WOULDBLOCK;
1820 goto out;
1823 if (!lookuperr && nbl_need_check(tvp)) {
1825 * The file exists. Now check if it has any
1826 * conflicting non-blocking mandatory locks
1827 * in the region being changed.
1829 struct vattr bva;
1830 u_offset_t offset;
1831 ssize_t length;
1833 nbl_start_crit(tvp, RW_READER);
1834 in_crit = 1;
1836 bva.va_mask = AT_SIZE;
1837 error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1838 if (!error) {
1839 if (va.va_size < bva.va_size) {
1840 offset = va.va_size;
1841 length = bva.va_size - va.va_size;
1842 } else {
1843 offset = bva.va_size;
1844 length = va.va_size - bva.va_size;
1846 if (length) {
1847 if (nbl_conflict(tvp, NBL_WRITE,
1848 offset, length, 0, NULL)) {
1849 error = EACCES;
1853 if (error) {
1854 nbl_end_crit(tvp);
1855 VN_RELE(tvp);
1856 in_crit = 0;
1858 } else if (tvp != NULL) {
1859 VN_RELE(tvp);
1863 if (!error) {
1865 * If filesystem is shared with nosuid the remove any
1866 * setuid/setgid bits on create.
1868 if (va.va_type == VREG &&
1869 exi->exi_export.ex_flags & EX_NOSUID)
1870 va.va_mode &= ~(VSUID | VSGID);
1872 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1873 NULL, NULL);
1875 if (!error) {
1877 if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1878 trunc = TRUE;
1879 else
1880 trunc = FALSE;
1882 if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1883 VN_RELE(vp);
1884 curthread->t_flag |= T_WOULDBLOCK;
1885 goto out;
1887 va.va_mask = AT_ALL;
1889 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1891 /* check for overflows */
1892 if (!error) {
1893 acl_perm(vp, exi, &va, cr);
1894 error = vattr_to_nattr(&va, &dr->dr_attr);
1895 if (!error) {
1896 error = makefh(&dr->dr_fhandle, vp,
1897 exi);
1901 * Force modified metadata out to stable storage.
1903 * if a underlying vp exists, pass it to VOP_FSYNC
1905 if (VOP_REALVP(vp, &realvp, NULL) == 0)
1906 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
1907 else
1908 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1909 VN_RELE(vp);
1912 if (in_crit) {
1913 nbl_end_crit(tvp);
1914 VN_RELE(tvp);
1919 * Force modified data and metadata out to stable storage.
1921 (void) VOP_FSYNC(dvp, 0, cr, NULL);
1923 out:
1925 VN_RELE(dvp);
1927 dr->dr_status = puterrno(error);
1929 if (name != args->ca_da.da_name)
1930 kmem_free(name, MAXPATHLEN);
/*
 * Return a pointer to the file handle of the directory in which the
 * CREATE will take place; used by the dispatcher for export lookup.
 */
void *
rfs_create_getfh(struct nfscreatargs *args)
{
	return (args->ca_da.da_fhandle);
}
/*
 * Remove a file.
 * Remove named file from parent directory.
 *
 * The target is looked up first so we can (a) recall any NFSv4 write
 * delegation on it (dropping this request via T_WOULDBLOCK so the
 * client retransmits later) and (b) check for conflicting non-blocking
 * mandatory share reservations before calling VOP_REMOVE.
 */
void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error = 0;
	vnode_t *vp;		/* parent directory, held by nfs_fhtovp */
	vnode_t *targvp;	/* file being removed, held by VOP_LOOKUP */
	int in_crit = 0;	/* nonzero once inside nbl critical region */

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 */
	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(vp);
		*status = puterrno(error);
		return;
	}

	/*
	 * If the file is delegated to an v4 client, then initiate
	 * recall and drop this request (by setting T_WOULDBLOCK).
	 * The client will eventually re-transmit the request and
	 * (hopefully), by then, the v4 client will have returned
	 * the delegation.
	 */

	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
		VN_RELE(vp);
		VN_RELE(targvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (nbl_need_check(targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

out:
	/* Leave the critical region (if entered) before dropping holds. */
	if (in_crit)
		nbl_end_crit(targvp);
	VN_RELE(targvp);
	VN_RELE(vp);

	*status = puterrno(error);
}
/*
 * Return a pointer to the parent-directory file handle of a REMOVE
 * request; used by the dispatcher for export lookup.
 */
void *
rfs_remove_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
/*
 * rename a file
 * Give a file (from) a new name (to).
 *
 * Both directory handles must resolve to the same export (NFSv2 renames
 * may not cross exports), both must be directories, and the export must
 * be writable.  Delegations on the source file and on any file being
 * renamed over are recalled first; in that case the request is dropped
 * by setting T_WOULDBLOCK so the client retransmits later.
 */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error = 0;
	vnode_t *fromvp;	/* source parent directory */
	vnode_t *tovp;		/* target parent directory */
	struct exportinfo *to_exi;
	fhandle_t *fh;
	vnode_t *srcvp;		/* file being renamed */
	vnode_t *targvp;	/* existing file being renamed over, if any */
	int in_crit = 0;

	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	/* Only the pointer value is compared below, so release now. */
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 */
	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = puterrno(error);
		return;
	}

	/* Check for delegations on the source file */

	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		VN_RELE(srcvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Check for delegation on the file being renamed over, if it exists */

	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
	    NULL, NULL, NULL) == 0) {

		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
			VN_RELE(tovp);
			VN_RELE(fromvp);
			VN_RELE(srcvp);
			VN_RELE(targvp);
			curthread->t_flag |= T_WOULDBLOCK;
			return;
		}
		VN_RELE(targvp);
	}

	/* Guard against non-blocking mandatory locks on the source. */
	if (nbl_need_check(srcvp)) {
		nbl_start_crit(srcvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_RENAME(fromvp, args->rna_from.da_name,
	    tovp, args->rna_to.da_name, cr, NULL, 0);

	/* Keep the vnode's cached pathname up to date for auditing etc. */
	if (error == 0)
		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
		    strlen(args->rna_to.da_name));

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(srcvp);
	VN_RELE(srcvp);
	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);
}
/*
 * Return a pointer to the source-directory file handle of a RENAME
 * request; used by the dispatcher for export lookup.
 */
void *
rfs_rename_getfh(struct nfsrnmargs *args)
{
	return (args->rna_from.da_fhandle);
}
2178 * Link to a file.
2179 * Create a file (to) which is a hard link to the given file (from).
2181 void
2182 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2183 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2185 int error;
2186 vnode_t *fromvp;
2187 vnode_t *tovp;
2188 struct exportinfo *to_exi;
2189 fhandle_t *fh;
2191 fromvp = nfs_fhtovp(args->la_from, exi);
2192 if (fromvp == NULL) {
2193 *status = NFSERR_STALE;
2194 return;
2197 fh = args->la_to.da_fhandle;
2198 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2199 if (to_exi == NULL) {
2200 VN_RELE(fromvp);
2201 *status = NFSERR_ACCES;
2202 return;
2204 exi_rele(to_exi);
2206 if (to_exi != exi) {
2207 VN_RELE(fromvp);
2208 *status = NFSERR_XDEV;
2209 return;
2212 tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2213 if (tovp == NULL) {
2214 VN_RELE(fromvp);
2215 *status = NFSERR_STALE;
2216 return;
2219 if (tovp->v_type != VDIR) {
2220 VN_RELE(tovp);
2221 VN_RELE(fromvp);
2222 *status = NFSERR_NOTDIR;
2223 return;
2226 * Disallow NULL paths
2228 if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2229 VN_RELE(tovp);
2230 VN_RELE(fromvp);
2231 *status = NFSERR_ACCES;
2232 return;
2235 if (rdonly(exi, req)) {
2236 VN_RELE(tovp);
2237 VN_RELE(fromvp);
2238 *status = NFSERR_ROFS;
2239 return;
2242 error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2245 * Force modified data and metadata out to stable storage.
2247 (void) VOP_FSYNC(tovp, 0, cr, NULL);
2248 (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2250 VN_RELE(tovp);
2251 VN_RELE(fromvp);
2253 *status = puterrno(error);
/*
 * Return a pointer to the source file handle of a LINK request; used
 * by the dispatcher for export lookup.
 */
void *
rfs_link_getfh(struct nfslinkargs *args)
{
	return (args->la_from);
}
2263 * Symbolicly link to a file.
2264 * Create a file (to) with the given attributes which is a symbolic link
2265 * to the given path name (to).
2267 void
2268 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2269 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2271 int error;
2272 struct vattr va;
2273 vnode_t *vp;
2274 vnode_t *svp;
2275 int lerror;
2276 struct sockaddr *ca;
2277 char *name = NULL;
2280 * Disallow NULL paths
2282 if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2283 *status = NFSERR_ACCES;
2284 return;
2287 vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2288 if (vp == NULL) {
2289 *status = NFSERR_STALE;
2290 return;
2293 if (rdonly(exi, req)) {
2294 VN_RELE(vp);
2295 *status = NFSERR_ROFS;
2296 return;
2299 error = sattr_to_vattr(args->sla_sa, &va);
2300 if (error) {
2301 VN_RELE(vp);
2302 *status = puterrno(error);
2303 return;
2306 if (!(va.va_mask & AT_MODE)) {
2307 VN_RELE(vp);
2308 *status = NFSERR_INVAL;
2309 return;
2312 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2313 name = nfscmd_convname(ca, exi, args->sla_tnm,
2314 NFSCMD_CONV_INBOUND, MAXPATHLEN);
2316 if (name == NULL) {
2317 *status = NFSERR_ACCES;
2318 return;
2321 va.va_type = VLNK;
2322 va.va_mask |= AT_TYPE;
2324 error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2327 * Force new data and metadata out to stable storage.
2329 lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2330 NULL, cr, NULL, NULL, NULL);
2332 if (!lerror) {
2333 (void) VOP_FSYNC(svp, 0, cr, NULL);
2334 VN_RELE(svp);
2338 * Force modified data and metadata out to stable storage.
2340 (void) VOP_FSYNC(vp, 0, cr, NULL);
2342 VN_RELE(vp);
2344 *status = puterrno(error);
2345 if (name != args->sla_tnm)
2346 kmem_free(name, MAXPATHLEN);
/*
 * Return a pointer to the parent-directory file handle of a SYMLINK
 * request; used by the dispatcher for export lookup.
 */
void *
rfs_symlink_getfh(struct nfsslargs *args)
{
	return (args->sla_from.da_fhandle);
}
/*
 * Make a directory.
 * Create a directory with the given name, parent directory, and attributes.
 * Returns a file handle and attributes for the new directory.
 */
void
rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	struct vattr va;
	vnode_t *dvp = NULL;	/* the newly created directory */
	vnode_t *vp;		/* the parent directory */
	char *name = args->ca_da.da_name;

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (vp == NULL) {
		dr->dr_status = NFSERR_STALE;
		return;
	}

	if (rdonly(exi, req)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(vp);
		dr->dr_status = puterrno(error);
		return;
	}

	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_INVAL;
		return;
	}

	va.va_type = VDIR;
	va.va_mask |= AT_TYPE;

	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);

	if (!error) {
		/*
		 * Attributes of the newly created directory should
		 * be returned to the client.
		 */
		va.va_mask = AT_ALL; /* We want everything */
		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);

		/* check for overflows */
		if (!error) {
			/*
			 * NOTE(review): acl_perm() is given the parent
			 * vnode (vp) although the attributes in va are
			 * the new directory's (dvp) — looks like it
			 * should be dvp; confirm before changing.
			 */
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				error = makefh(&dr->dr_fhandle, dvp, exi);
			}
		}
		/*
		 * Force new data and metadata out to stable storage.
		 */
		(void) VOP_FSYNC(dvp, 0, cr, NULL);
		VN_RELE(dvp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	dr->dr_status = puterrno(error);
}
/*
 * Return a pointer to the parent-directory file handle of a MKDIR
 * request; used by the dispatcher for export lookup.
 */
void *
rfs_mkdir_getfh(struct nfscreatargs *args)
{
	return (args->ca_da.da_fhandle);
}
2448 * Remove a directory.
2449 * Remove the given directory name from the given parent directory.
2451 void
2452 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2453 struct exportinfo *exi, struct svc_req *req, cred_t *cr)
2455 int error;
2456 vnode_t *vp;
2460 * Disallow NULL paths
2462 if (da->da_name == NULL || *da->da_name == '\0') {
2463 *status = NFSERR_ACCES;
2464 return;
2467 vp = nfs_fhtovp(da->da_fhandle, exi);
2468 if (vp == NULL) {
2469 *status = NFSERR_STALE;
2470 return;
2473 if (rdonly(exi, req)) {
2474 VN_RELE(vp);
2475 *status = NFSERR_ROFS;
2476 return;
2480 * VOP_RMDIR now takes a new third argument (the current
2481 * directory of the process). That's because someone
2482 * wants to return EINVAL if one tries to remove ".".
2483 * Of course, NFS servers have no idea what their
2484 * clients' current directories are. We fake it by
2485 * supplying a vnode known to exist and illegal to
2486 * remove.
2488 error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
2491 * Force modified data and metadata out to stable storage.
2493 (void) VOP_FSYNC(vp, 0, cr, NULL);
2495 VN_RELE(vp);
2498 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2499 * if the directory is not empty. A System V NFS server
2500 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2501 * over the wire.
2503 if (error == EEXIST)
2504 *status = NFSERR_NOTEMPTY;
2505 else
2506 *status = puterrno(error);
/*
 * Return a pointer to the parent-directory file handle of an RMDIR
 * request; used by the dispatcher for export lookup.
 */
void *
rfs_rmdir_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
/*
 * Read directory entries.
 * Fills rd with up to rda_count bytes of dirent data starting at
 * rda_offset; the buffer is allocated here and later released by
 * rfs_rddirfree().  Entry names may be character-set converted for the
 * client; entries that no longer fit after conversion are dropped and
 * the size/eof results patched up accordingly.
 */
/* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr)
{
	int error;
	int iseof;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	char *ndata = NULL;	/* converted entry buffer, if any */
	struct sockaddr *ca;
	size_t nents;
	int ret;

	vp = nfs_fhtovp(&rda->rda_fh, exi);
	if (vp == NULL) {
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_STALE;
		return;
	}

	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_NOTDIR;
		return;
	}

	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);

	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);

	if (error) {
		rd->rd_entries = NULL;
		goto bad;
	}

	if (rda->rda_count == 0) {
		rd->rd_entries = NULL;
		rd->rd_size = 0;
		rd->rd_eof = FALSE;
		goto bad;
	}

	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);

	/*
	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
	 */
	rd->rd_bufsize = (uint_t)rda->rda_count;
	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);

	/*
	 * Set up io vector to read directory data
	 */
	iov.iov_base = (caddr_t)rd->rd_entries;
	iov.iov_len = rda->rda_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)rda->rda_offset;
	uio.uio_resid = rda->rda_count;

	/*
	 * read directory
	 */
	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);

	/*
	 * Clean up
	 */
	if (!error) {
		/*
		 * set size and eof
		 */
		if (uio.uio_resid == rda->rda_count) {
			/* nothing read: report eof with no entries */
			rd->rd_size = 0;
			rd->rd_eof = TRUE;
		} else {
			rd->rd_size = (uint32_t)(rda->rda_count -
			    uio.uio_resid);
			rd->rd_eof = iseof ? TRUE : FALSE;
		}
	}

	/* Convert entry names to the client's character set, if needed. */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
	    rda->rda_count, &ndata);

	if (ret != 0) {
		size_t dropbytes;
		/*
		 * We had to drop one or more entries in order to fit
		 * during the character conversion.  We need to patch
		 * up the size and eof info.
		 */
		if (rd->rd_eof)
			rd->rd_eof = FALSE;
		dropbytes = nfscmd_dropped_entrysize(
		    (struct dirent64 *)rd->rd_entries, nents, ret);
		rd->rd_size -= dropbytes;
	}
	if (ndata == NULL) {
		/* no conversion took place; keep the original buffer */
		ndata = (char *)rd->rd_entries;
	} else if (ndata != (char *)rd->rd_entries) {
		/* conversion produced a new buffer; swap it in */
		kmem_free(rd->rd_entries, rd->rd_bufsize);
		rd->rd_entries = (void *)ndata;
		rd->rd_bufsize = rda->rda_count;
	}

bad:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rd->rd_status = puterrno(error);
}
/*
 * Return a pointer to the directory file handle of a READDIR request;
 * used by the dispatcher for export lookup.
 */
void *
rfs_readdir_getfh(struct nfsrddirargs *rda)
{
	return (&rda->rda_fh);
}
2653 void
2654 rfs_rddirfree(struct nfsrddirres *rd)
2656 if (rd->rd_entries != NULL)
2657 kmem_free(rd->rd_entries, rd->rd_bufsize);
2660 /* ARGSUSED */
2661 void
2662 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2663 struct svc_req *req, cred_t *cr)
2665 int error;
2666 struct statvfs64 sb;
2667 vnode_t *vp;
2669 vp = nfs_fhtovp(fh, exi);
2670 if (vp == NULL) {
2671 fs->fs_status = NFSERR_STALE;
2672 return;
2675 error = VFS_STATVFS(vp->v_vfsp, &sb);
2677 if (!error) {
2678 fs->fs_tsize = nfstsize();
2679 fs->fs_bsize = sb.f_frsize;
2680 fs->fs_blocks = sb.f_blocks;
2681 fs->fs_bfree = sb.f_bfree;
2682 fs->fs_bavail = sb.f_bavail;
2685 VN_RELE(vp);
2687 fs->fs_status = puterrno(error);
/*
 * Return the file handle of a STATFS request; used by the dispatcher
 * for export lookup.
 */
void *
rfs_statfs_getfh(fhandle_t *fh)
{
	return (fh);
}
/*
 * Convert NFSv2 settable attributes (nfssattr) into a vattr, setting
 * va_mask bits only for fields the client actually supplied.  In the
 * v2 protocol an all-ones value means "don't set this field".
 * Returns 0, or EOVERFLOW on 32-bit kernels when a supplied time does
 * not fit in a time_t.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2763 static enum nfsftype vt_to_nf[] = {
2764 0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2768 * check the following fields for overflow: nodeid, size, and time.
2769 * There could be a problem when converting 64-bit LP64 fields
2770 * into 32-bit ones. Return an error if there is an overflow.
2773 vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2775 ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2776 na->na_type = vt_to_nf[vap->va_type];
2778 if (vap->va_mode == (unsigned short) -1)
2779 na->na_mode = (uint32_t)-1;
2780 else
2781 na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2783 if (vap->va_uid == (unsigned short)(-1))
2784 na->na_uid = (uint32_t)(-1);
2785 else if (vap->va_uid == UID_NOBODY)
2786 na->na_uid = (uint32_t)NFS_UID_NOBODY;
2787 else
2788 na->na_uid = vap->va_uid;
2790 if (vap->va_gid == (unsigned short)(-1))
2791 na->na_gid = (uint32_t)-1;
2792 else if (vap->va_gid == GID_NOBODY)
2793 na->na_gid = (uint32_t)NFS_GID_NOBODY;
2794 else
2795 na->na_gid = vap->va_gid;
2798 * Do we need to check fsid for overflow? It is 64-bit in the
2799 * vattr, but are bigger than 32 bit values supported?
2801 na->na_fsid = vap->va_fsid;
2803 na->na_nodeid = vap->va_nodeid;
2806 * Check to make sure that the nodeid is representable over the
2807 * wire without losing bits.
2809 if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2810 return (EFBIG);
2811 na->na_nlink = vap->va_nlink;
2814 * Check for big files here, instead of at the caller. See
2815 * comments in cstat for large special file explanation.
2817 if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2818 if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2819 return (EFBIG);
2820 if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2821 /* UNKNOWN_SIZE | OVERFLOW */
2822 na->na_size = MAXOFF32_T;
2823 } else
2824 na->na_size = vap->va_size;
2825 } else
2826 na->na_size = vap->va_size;
2829 * If the vnode times overflow the 32-bit times that NFS2
2830 * uses on the wire then return an error.
2832 if (!NFS_VAP_TIME_OK(vap)) {
2833 return (EOVERFLOW);
2835 na->na_atime.tv_sec = vap->va_atime.tv_sec;
2836 na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2838 na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2839 na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2841 na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2842 na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2845 * If the dev_t will fit into 16 bits then compress
2846 * it, otherwise leave it alone. See comments in
2847 * nfs_client.c.
2849 if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2850 getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2851 na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2852 else
2853 (void) cmpldev(&na->na_rdev, vap->va_rdev);
2855 na->na_blocks = vap->va_nblocks;
2856 na->na_blocksize = vap->va_blksize;
2859 * This bit of ugliness is a *TEMPORARY* hack to preserve the
2860 * over-the-wire protocols for named-pipe vnodes. It remaps the
2861 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2863 * BUYER BEWARE:
2864 * If you are porting the NFS to a non-Sun server, you probably
2865 * don't want to include the following block of code. The
2866 * over-the-wire special file types will be changing with the
2867 * NFS Protocol Revision.
2869 if (vap->va_type == VFIFO)
2870 NA_SETFIFO(na);
2871 return (0);
2875 * acl v2 support: returns approximate permission.
2876 * default: returns minimal permission (more restrictive)
2877 * aclok: returns maximal permission (less restrictive)
2878 * This routine changes the permissions that are alaredy in *va.
2879 * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
2880 * CLASS_OBJ is always the same as GROUP_OBJ entry.
2882 static void
2883 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
2885 vsecattr_t vsa;
2886 int aclcnt;
2887 aclent_t *aclentp;
2888 mode_t mask_perm;
2889 mode_t grp_perm;
2890 mode_t other_perm;
2891 mode_t other_orig;
2892 int error;
2894 /* dont care default acl */
2895 vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
2896 error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
2898 if (!error) {
2899 aclcnt = vsa.vsa_aclcnt;
2900 if (aclcnt > MIN_ACL_ENTRIES) {
2901 /* non-trivial ACL */
2902 aclentp = vsa.vsa_aclentp;
2903 if (exi->exi_export.ex_flags & EX_ACLOK) {
2904 /* maximal permissions */
2905 grp_perm = 0;
2906 other_perm = 0;
2907 for (; aclcnt > 0; aclcnt--, aclentp++) {
2908 switch (aclentp->a_type) {
2909 case USER_OBJ:
2910 break;
2911 case USER:
2912 grp_perm |=
2913 aclentp->a_perm << 3;
2914 other_perm |= aclentp->a_perm;
2915 break;
2916 case GROUP_OBJ:
2917 grp_perm |=
2918 aclentp->a_perm << 3;
2919 break;
2920 case GROUP:
2921 other_perm |= aclentp->a_perm;
2922 break;
2923 case OTHER_OBJ:
2924 other_orig = aclentp->a_perm;
2925 break;
2926 case CLASS_OBJ:
2927 mask_perm = aclentp->a_perm;
2928 break;
2929 default:
2930 break;
2933 grp_perm &= mask_perm << 3;
2934 other_perm &= mask_perm;
2935 other_perm |= other_orig;
2937 } else {
2938 /* minimal permissions */
2939 grp_perm = 070;
2940 other_perm = 07;
2941 for (; aclcnt > 0; aclcnt--, aclentp++) {
2942 switch (aclentp->a_type) {
2943 case USER_OBJ:
2944 break;
2945 case USER:
2946 case CLASS_OBJ:
2947 grp_perm &=
2948 aclentp->a_perm << 3;
2949 other_perm &=
2950 aclentp->a_perm;
2951 break;
2952 case GROUP_OBJ:
2953 grp_perm &=
2954 aclentp->a_perm << 3;
2955 break;
2956 case GROUP:
2957 other_perm &=
2958 aclentp->a_perm;
2959 break;
2960 case OTHER_OBJ:
2961 other_perm &=
2962 aclentp->a_perm;
2963 break;
2964 default:
2965 break;
2969 /* copy to va */
2970 va->va_mode &= ~077;
2971 va->va_mode |= grp_perm | other_perm;
2973 if (vsa.vsa_aclcnt)
2974 kmem_free(vsa.vsa_aclentp,
2975 vsa.vsa_aclcnt * sizeof (aclent_t));
2979 void
2980 rfs_srvrinit(void)
2982 mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
2983 nfs2_srv_caller_id = fs_new_caller_id();
2986 void
2987 rfs_srvrfini(void)
2989 mutex_destroy(&rfs_async_write_lock);
2992 static int
2993 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
2995 struct clist *wcl;
2996 int wlist_len;
2997 uint32_t count = rr->rr_count;
2999 wcl = ra->ra_wlist;
3001 if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3002 return (FALSE);
3005 wcl = ra->ra_wlist;
3006 rr->rr_ok.rrok_wlist_len = wlist_len;
3007 rr->rr_ok.rrok_wlist = wcl;
3009 return (TRUE);