1 /*
2 * Copyright (c) 1989, 1991, 1993, 1995
3 * The Regents of the University of California. All rights reserved.
5 * This code is derived from software contributed to Berkeley by
6 * Rick Macklem at The University of Guelph.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 * must display the following acknowledgement:
18 * This product includes software developed by the University of
19 * California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
36 * @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
37 * $FreeBSD: src/sys/nfs/nfs_socket.c,v 1.60.2.6 2003/03/26 01:44:46 alfred Exp $
38 * $DragonFly: src/sys/vfs/nfs/nfs_socket.c,v 1.41 2007/02/21 15:46:48 corecode Exp $
42 * Socket operations for use by nfs
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/proc.h>
48 #include <sys/malloc.h>
49 #include <sys/mount.h>
50 #include <sys/kernel.h>
51 #include <sys/mbuf.h>
52 #include <sys/vnode.h>
53 #include <sys/fcntl.h>
54 #include <sys/protosw.h>
55 #include <sys/resourcevar.h>
56 #include <sys/socket.h>
57 #include <sys/socketvar.h>
58 #include <sys/socketops.h>
59 #include <sys/syslog.h>
60 #include <sys/thread.h>
61 #include <sys/tprintf.h>
62 #include <sys/sysctl.h>
63 #include <sys/signalvar.h>
65 #include <netinet/in.h>
66 #include <netinet/tcp.h>
67 #include <sys/thread2.h>
69 #include "rpcv2.h"
70 #include "nfsproto.h"
71 #include "nfs.h"
72 #include "xdr_subs.h"
73 #include "nfsm_subs.h"
74 #include "nfsmount.h"
75 #include "nfsnode.h"
76 #include "nfsrtt.h"
78 #define TRUE 1
79 #define FALSE 0
82 * Estimate rto for an nfs rpc sent via an unreliable datagram.
83 * Use the mean and mean deviation of rtt for the appropriate type of rpc
84 * for the frequent rpcs and a default for the others.
85 * The justification for doing "other" this way is that these rpcs
86 * happen so infrequently that a timer estimate would probably be stale.
87 * Also, since many of these rpcs are
88 * non-idempotent, a conservative timeout is desired.
89 * getattr, lookup - A+2D
90 * read, write - A+4D
91 * other - nm_timeo
93 #define NFS_RTO(n, t) \
94 ((t) == 0 ? (n)->nm_timeo : \
95 ((t) < 3 ? \
96 (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
97 ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
98 #define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
99 #define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
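/*
 * Worked example (illustrative only): nm_srtt is seeded with
 * NFS_TIMEO << 3 and updated with a gain of 1/8, so it holds the
 * smoothed rtt scaled by 8; nm_sdrtt likewise holds the deviation
 * scaled by 4.  With a smoothed rtt of 8 ticks and a deviation of
 * 2 ticks (stored as 64 and 8), NFS_RTO yields
 *	t == 1, 2:  (((64 + 3) >> 2) + 8 + 1) >> 1 == 12   (~A + 2D)
 *	t == 3, 4:  ((64 + 7) >> 3) + 8 + 1      == 17   (~A + 4D)
 * which matches the A+2D / A+4D rules quoted above.
 */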
101 * External data, mostly RPC constants in XDR form
103 extern u_int32_t rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers,
104 rpc_auth_unix, rpc_msgaccepted, rpc_call, rpc_autherr,
105 rpc_auth_kerb;
106 extern u_int32_t nfs_prog;
107 extern struct nfsstats nfsstats;
108 extern int nfsv3_procid[NFS_NPROCS];
109 extern int nfs_ticks;
112 * Defines which timer to use for the procnum.
113 * 0 - default
114 * 1 - getattr
115 * 2 - lookup
116 * 3 - read
117 * 4 - write
119 static int proct[NFS_NPROCS] = {
120 0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0,
121 0, 0, 0,
124 static int nfs_realign_test;
125 static int nfs_realign_count;
126 static int nfs_bufpackets = 4;
127 static int nfs_timer_raced;
129 SYSCTL_DECL(_vfs_nfs);
131 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RW, &nfs_realign_test, 0, "");
132 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RW, &nfs_realign_count, 0, "");
133 SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0, "");
137 * There is a congestion window for outstanding rpcs maintained per mount
138 * point. The cwnd size is adjusted in roughly the way that:
139 * Van Jacobson, Congestion Avoidance and Control, In "Proceedings of
140 * SIGCOMM '88". ACM, August 1988.
141 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
142 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
143 * of rpcs is in progress.
144 * (The sent count and cwnd are scaled for integer arith.)
145 * Variants of "slow start" were tried and were found to be too much of a
146 * performance hit (ave. rtt 3 times larger),
147 * I suspect due to the large rtt that nfs rpcs have.
149 #define NFS_CWNDSCALE 256
150 #define NFS_MAXCWND (NFS_CWNDSCALE * 32)
151 static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, };
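/*
 * One outstanding rpc costs NFS_CWNDSCALE (256) window units, so
 * NFS_MAXCWND caps the pipeline at 32 rpcs.  nfs_backoff stretches the
 * retransmit timeout by 2x through 256x; nfs_timer() indexes it with
 * the mount's nm_timeouts count, which is clamped to 8.
 */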
152 int nfsrtton = 0;
153 struct nfsrtt nfsrtt;
154 struct callout nfs_timer_handle;
156 static int nfs_msg (struct thread *,char *,char *);
157 static int nfs_rcvlock (struct nfsreq *);
158 static void nfs_rcvunlock (struct nfsreq *);
159 static void nfs_realign (struct mbuf **pm, int hsiz);
160 static int nfs_receive (struct nfsreq *rep, struct sockaddr **aname,
161 struct mbuf **mp);
162 static void nfs_softterm (struct nfsreq *rep);
163 static int nfs_reconnect (struct nfsreq *rep);
164 #ifndef NFS_NOSERVER
165 static int nfsrv_getstream (struct nfssvc_sock *, int, int *);
167 int (*nfsrv3_procs[NFS_NPROCS]) (struct nfsrv_descript *nd,
168 struct nfssvc_sock *slp,
169 struct thread *td,
170 struct mbuf **mreqp) = {
171 nfsrv_null,
172 nfsrv_getattr,
173 nfsrv_setattr,
174 nfsrv_lookup,
175 nfsrv3_access,
176 nfsrv_readlink,
177 nfsrv_read,
178 nfsrv_write,
179 nfsrv_create,
180 nfsrv_mkdir,
181 nfsrv_symlink,
182 nfsrv_mknod,
183 nfsrv_remove,
184 nfsrv_rmdir,
185 nfsrv_rename,
186 nfsrv_link,
187 nfsrv_readdir,
188 nfsrv_readdirplus,
189 nfsrv_statfs,
190 nfsrv_fsinfo,
191 nfsrv_pathconf,
192 nfsrv_commit,
193 nfsrv_noop,
194 nfsrv_noop,
195 nfsrv_noop,
196 nfsrv_noop
198 #endif /* NFS_NOSERVER */
201 * Initialize sockets and congestion for a new NFS connection.
202 * We do not free the sockaddr on error.
205 nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
207 struct socket *so;
208 int error, rcvreserve, sndreserve;
209 int pktscale;
210 struct sockaddr *saddr;
211 struct sockaddr_in *sin;
212 struct thread *td = &thread0; /* only used for socreate and sobind */
214 nmp->nm_so = (struct socket *)0;
215 saddr = nmp->nm_nam;
216 error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
217 nmp->nm_soproto, td);
218 if (error)
219 goto bad;
220 so = nmp->nm_so;
221 nmp->nm_soflags = so->so_proto->pr_flags;
224 * Some servers require that the client port be a reserved port number.
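 * To obtain one we temporarily set IP_PORTRANGE to IP_PORTRANGE_LOW,
 * bind an anonymous port (the low range hands out ports below 1024,
 * which need privilege), then restore IP_PORTRANGE_DEFAULT so the
 * option does not affect later use of the socket.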
226 if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) {
227 struct sockopt sopt;
228 int ip;
229 struct sockaddr_in ssin;
231 bzero(&sopt, sizeof sopt);
232 ip = IP_PORTRANGE_LOW;
233 sopt.sopt_level = IPPROTO_IP;
234 sopt.sopt_name = IP_PORTRANGE;
235 sopt.sopt_val = (void *)&ip;
236 sopt.sopt_valsize = sizeof(ip);
237 sopt.sopt_td = NULL;
238 error = sosetopt(so, &sopt);
239 if (error)
240 goto bad;
241 bzero(&ssin, sizeof ssin);
242 sin = &ssin;
243 sin->sin_len = sizeof (struct sockaddr_in);
244 sin->sin_family = AF_INET;
245 sin->sin_addr.s_addr = INADDR_ANY;
246 sin->sin_port = htons(0);
247 error = sobind(so, (struct sockaddr *)sin, td);
248 if (error)
249 goto bad;
250 bzero(&sopt, sizeof sopt);
251 ip = IP_PORTRANGE_DEFAULT;
252 sopt.sopt_level = IPPROTO_IP;
253 sopt.sopt_name = IP_PORTRANGE;
254 sopt.sopt_val = (void *)&ip;
255 sopt.sopt_valsize = sizeof(ip);
256 sopt.sopt_td = NULL;
257 error = sosetopt(so, &sopt);
258 if (error)
259 goto bad;
263 * Protocols that do not require connections may be optionally left
264 * unconnected for servers that reply from a port other than NFS_PORT.
266 if (nmp->nm_flag & NFSMNT_NOCONN) {
267 if (nmp->nm_soflags & PR_CONNREQUIRED) {
268 error = ENOTCONN;
269 goto bad;
271 } else {
272 error = soconnect(so, nmp->nm_nam, td);
273 if (error)
274 goto bad;
277 * Wait for the connection to complete. Cribbed from the
278 * connect system call but with the wait timing out so
279 * that interruptible mounts don't hang here for a long time.
281 crit_enter();
282 while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
283 (void) tsleep((caddr_t)&so->so_timeo, 0,
284 "nfscon", 2 * hz);
285 if ((so->so_state & SS_ISCONNECTING) &&
286 so->so_error == 0 && rep &&
287 (error = nfs_sigintr(nmp, rep, rep->r_td)) != 0){
288 so->so_state &= ~SS_ISCONNECTING;
289 crit_exit();
290 goto bad;
293 if (so->so_error) {
294 error = so->so_error;
295 so->so_error = 0;
296 crit_exit();
297 goto bad;
299 crit_exit();
301 so->so_rcv.sb_timeo = (5 * hz);
302 so->so_snd.sb_timeo = (5 * hz);
305 * Get buffer reservation size from sysctl, but impose reasonable
306 * limits.
308 pktscale = nfs_bufpackets;
309 if (pktscale < 2)
310 pktscale = 2;
311 if (pktscale > 64)
312 pktscale = 64;
314 if (nmp->nm_sotype == SOCK_DGRAM) {
315 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
316 rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
317 NFS_MAXPKTHDR) * pktscale;
318 } else if (nmp->nm_sotype == SOCK_SEQPACKET) {
319 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
320 rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
321 NFS_MAXPKTHDR) * pktscale;
322 } else {
323 if (nmp->nm_sotype != SOCK_STREAM)
324 panic("nfscon sotype");
325 if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
326 struct sockopt sopt;
327 int val;
329 bzero(&sopt, sizeof sopt);
330 sopt.sopt_level = SOL_SOCKET;
331 sopt.sopt_name = SO_KEEPALIVE;
332 sopt.sopt_val = &val;
333 sopt.sopt_valsize = sizeof val;
334 val = 1;
335 sosetopt(so, &sopt);
337 if (so->so_proto->pr_protocol == IPPROTO_TCP) {
338 struct sockopt sopt;
339 int val;
341 bzero(&sopt, sizeof sopt);
342 sopt.sopt_level = IPPROTO_TCP;
343 sopt.sopt_name = TCP_NODELAY;
344 sopt.sopt_val = &val;
345 sopt.sopt_valsize = sizeof val;
346 val = 1;
347 sosetopt(so, &sopt);
349 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
350 sizeof (u_int32_t)) * pktscale;
351 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
352 sizeof (u_int32_t)) * pktscale;
354 error = soreserve(so, sndreserve, rcvreserve,
355 &td->td_proc->p_rlimit[RLIMIT_SBSIZE]);
356 if (error)
357 goto bad;
358 so->so_rcv.sb_flags |= SB_NOINTR;
359 so->so_snd.sb_flags |= SB_NOINTR;
361 /* Initialize other non-zero congestion variables */
362 nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
363 nmp->nm_srtt[3] = (NFS_TIMEO << 3);
364 nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
365 nmp->nm_sdrtt[3] = 0;
366 nmp->nm_cwnd = NFS_MAXCWND / 2; /* Initial send window */
367 nmp->nm_sent = 0;
368 nmp->nm_timeouts = 0;
369 return (0);
371 bad:
372 nfs_disconnect(nmp);
373 return (error);
377 * Reconnect routine:
378 * Called when a connection is broken on a reliable protocol.
379 * - clean up the old socket
380 * - nfs_connect() again
381 * - set R_MUSTRESEND for all outstanding requests on mount point
382 * If this fails the mount point is DEAD!
383 * nb: Must be called with the nfs_sndlock() set on the mount point.
385 static int
386 nfs_reconnect(struct nfsreq *rep)
388 struct nfsreq *rp;
389 struct nfsmount *nmp = rep->r_nmp;
390 int error;
392 nfs_disconnect(nmp);
393 while ((error = nfs_connect(nmp, rep)) != 0) {
394 if (error == EINTR || error == ERESTART)
395 return (EINTR);
396 (void) tsleep((caddr_t)&lbolt, 0, "nfscon", 0);
400 * Loop through outstanding request list and fix up all requests
401 * on old socket.
403 crit_enter();
404 TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
405 if (rp->r_nmp == nmp)
406 rp->r_flags |= R_MUSTRESEND;
408 crit_exit();
409 return (0);
413 * NFS disconnect. Clean up and unlink.
415 void
416 nfs_disconnect(struct nfsmount *nmp)
418 struct socket *so;
420 if (nmp->nm_so) {
421 so = nmp->nm_so;
422 nmp->nm_so = (struct socket *)0;
423 soshutdown(so, 2);
424 soclose(so, FNONBLOCK);
428 void
429 nfs_safedisconnect(struct nfsmount *nmp)
431 struct nfsreq dummyreq;
433 bzero(&dummyreq, sizeof(dummyreq));
434 dummyreq.r_nmp = nmp;
435 dummyreq.r_td = NULL;
436 nfs_rcvlock(&dummyreq);
437 nfs_disconnect(nmp);
438 nfs_rcvunlock(&dummyreq);
442 * This is the nfs send routine. For connection based socket types, it
443 * must be called with an nfs_sndlock() on the socket.
444 * "rep == NULL" indicates that it has been called from a server.
445 * For the client side:
446 * - return EINTR if the RPC is terminated, 0 otherwise
447 * - set R_MUSTRESEND if the send fails for any reason
448 * - do any cleanup required by recoverable socket errors (?)
449 * For the server side:
450 * - return EINTR or ERESTART if interrupted by a signal
451 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
452 * - do any cleanup required by recoverable socket errors (?)
455 nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top,
456 struct nfsreq *rep)
458 struct sockaddr *sendnam;
459 int error, soflags, flags;
461 if (rep) {
462 if (rep->r_flags & R_SOFTTERM) {
463 m_freem(top);
464 return (EINTR);
466 if ((so = rep->r_nmp->nm_so) == NULL) {
467 rep->r_flags |= R_MUSTRESEND;
468 m_freem(top);
469 return (0);
471 rep->r_flags &= ~R_MUSTRESEND;
472 soflags = rep->r_nmp->nm_soflags;
473 } else
474 soflags = so->so_proto->pr_flags;
475 if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
476 sendnam = (struct sockaddr *)0;
477 else
478 sendnam = nam;
479 if (so->so_type == SOCK_SEQPACKET)
480 flags = MSG_EOR;
481 else
482 flags = 0;
484 error = so_pru_sosend(so, sendnam, NULL, top, NULL, flags,
485 curthread /*XXX*/);
487 * ENOBUFS for dgram sockets is transient and non fatal.
488 * No need to log, and no need to break a soft mount.
490 if (error == ENOBUFS && so->so_type == SOCK_DGRAM) {
491 error = 0;
492 if (rep) /* do backoff retransmit on client */
493 rep->r_flags |= R_MUSTRESEND;
496 if (error) {
497 if (rep) {
498 log(LOG_INFO, "nfs send error %d for server %s\n",error,
499 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
501 * Deal with errors for the client side.
503 if (rep->r_flags & R_SOFTTERM)
504 error = EINTR;
505 else
506 rep->r_flags |= R_MUSTRESEND;
507 } else
508 log(LOG_INFO, "nfsd send error %d\n", error);
511 * Handle any recoverable (soft) socket errors here. (?)
513 if (error != EINTR && error != ERESTART &&
514 error != EWOULDBLOCK && error != EPIPE)
515 error = 0;
517 return (error);
521 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
522 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
523 * Mark and consolidate the data into a new mbuf list.
524 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
525 * small mbufs.
526 * For SOCK_STREAM we must be very careful to read an entire record once
527 * we have read any of it, even if the system call has been interrupted.
529 static int
530 nfs_receive(struct nfsreq *rep, struct sockaddr **aname, struct mbuf **mp)
532 struct socket *so;
533 struct uio auio;
534 struct iovec aio;
535 struct mbuf *m;
536 struct mbuf *control;
537 u_int32_t len;
538 struct sockaddr **getnam;
539 int error, sotype, rcvflg;
540 struct thread *td = curthread; /* XXX */
543 * Set up arguments for soreceive()
545 *mp = (struct mbuf *)0;
546 *aname = (struct sockaddr *)0;
547 sotype = rep->r_nmp->nm_sotype;
550 * For reliable protocols, lock against other senders/receivers
551 * in case a reconnect is necessary.
552 * For SOCK_STREAM, first get the Record Mark to find out how much
553 * more there is to get.
554 * We must lock the socket against other receivers
555 * until we have an entire rpc request/reply.
557 if (sotype != SOCK_DGRAM) {
558 error = nfs_sndlock(rep);
559 if (error)
560 return (error);
561 tryagain:
563 * Check for fatal errors and for a request that needs resending.
566 * Ugh: If a reconnect attempt just happened, nm_so
567 * would have changed. NULL indicates a failed
568 * attempt that has essentially shut down this
569 * mount point.
571 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) {
572 nfs_sndunlock(rep);
573 return (EINTR);
575 so = rep->r_nmp->nm_so;
576 if (!so) {
577 error = nfs_reconnect(rep);
578 if (error) {
579 nfs_sndunlock(rep);
580 return (error);
582 goto tryagain;
584 while (rep->r_flags & R_MUSTRESEND) {
585 m = m_copym(rep->r_mreq, 0, M_COPYALL, MB_WAIT);
586 nfsstats.rpcretries++;
587 error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
588 if (error) {
589 if (error == EINTR || error == ERESTART ||
590 (error = nfs_reconnect(rep)) != 0) {
591 nfs_sndunlock(rep);
592 return (error);
594 goto tryagain;
597 nfs_sndunlock(rep);
598 if (sotype == SOCK_STREAM) {
599 aio.iov_base = (caddr_t) &len;
600 aio.iov_len = sizeof(u_int32_t);
601 auio.uio_iov = &aio;
602 auio.uio_iovcnt = 1;
603 auio.uio_segflg = UIO_SYSSPACE;
604 auio.uio_rw = UIO_READ;
605 auio.uio_offset = 0;
606 auio.uio_resid = sizeof(u_int32_t);
607 auio.uio_td = td;
608 do {
609 rcvflg = MSG_WAITALL;
610 error = so_pru_soreceive(so, NULL, &auio, NULL,
611 NULL, &rcvflg);
612 if (error == EWOULDBLOCK && rep) {
613 if (rep->r_flags & R_SOFTTERM)
614 return (EINTR);
616 } while (error == EWOULDBLOCK);
617 if (!error && auio.uio_resid > 0) {
619 * Don't log a 0 byte receive; it means
620 * that the socket has been closed, and
621 * can happen during normal operation
622 * (forcible unmount or Solaris server).
624 if (auio.uio_resid != sizeof (u_int32_t))
625 log(LOG_INFO,
626 "short receive (%d/%d) from nfs server %s\n",
627 (int)(sizeof(u_int32_t) - auio.uio_resid),
628 (int)sizeof(u_int32_t),
629 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
630 error = EPIPE;
632 if (error)
633 goto errout;
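		/*
		 * The record mark (RFC 1831) is one 32-bit word: the high
		 * bit flags the last fragment of a record and the low 31
		 * bits give the fragment length.  As in the BSD code this
		 * derives from, a single fragment per rpc is assumed, so
		 * the flag bit is simply masked off.
		 */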
634 len = ntohl(len) & ~0x80000000;
636 * This is SERIOUS! We are out of sync with the sender
637 * and forcing a disconnect/reconnect is all I can do.
639 if (len > NFS_MAXPACKET) {
640 log(LOG_ERR, "%s (%d) from nfs server %s\n",
641 "impossible packet length",
642 len,
643 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
644 error = EFBIG;
645 goto errout;
647 auio.uio_resid = len;
648 do {
649 rcvflg = MSG_WAITALL;
650 error = so_pru_soreceive(so, NULL, &auio, mp,
651 NULL, &rcvflg);
652 } while (error == EWOULDBLOCK || error == EINTR ||
653 error == ERESTART);
654 if (!error && auio.uio_resid > 0) {
655 if (len != auio.uio_resid)
656 log(LOG_INFO,
657 "short receive (%d/%d) from nfs server %s\n",
658 len - auio.uio_resid, len,
659 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
660 error = EPIPE;
662 } else {
664 * NB: Since uio_resid is big, MSG_WAITALL is ignored
665 * and soreceive() will return when it has either a
666 * control msg or a data msg.
667 * We have no use for control messages, but must grab them
668 * and then throw them away so we know what is going
669 * on.
671 auio.uio_resid = len = 100000000; /* Anything Big */
672 auio.uio_td = td;
673 do {
674 rcvflg = 0;
675 error = so_pru_soreceive(so, NULL, &auio, mp,
676 &control, &rcvflg);
677 if (control)
678 m_freem(control);
679 if (error == EWOULDBLOCK && rep) {
680 if (rep->r_flags & R_SOFTTERM)
681 return (EINTR);
683 } while (error == EWOULDBLOCK ||
684 (!error && *mp == NULL && control));
685 if ((rcvflg & MSG_EOR) == 0)
686 kprintf("Egad!!\n");
687 if (!error && *mp == NULL)
688 error = EPIPE;
689 len -= auio.uio_resid;
691 errout:
692 if (error && error != EINTR && error != ERESTART) {
693 m_freem(*mp);
694 *mp = (struct mbuf *)0;
695 if (error != EPIPE)
696 log(LOG_INFO,
697 "receive error %d from nfs server %s\n",
698 error,
699 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
700 error = nfs_sndlock(rep);
701 if (!error) {
702 error = nfs_reconnect(rep);
703 if (!error)
704 goto tryagain;
705 else
706 nfs_sndunlock(rep);
709 } else {
710 if ((so = rep->r_nmp->nm_so) == NULL)
711 return (EACCES);
712 if (so->so_state & SS_ISCONNECTED)
713 getnam = (struct sockaddr **)0;
714 else
715 getnam = aname;
716 auio.uio_resid = len = 1000000;
717 auio.uio_td = td;
718 do {
719 rcvflg = 0;
720 error = so_pru_soreceive(so, getnam, &auio, mp, NULL,
721 &rcvflg);
722 if (error == EWOULDBLOCK &&
723 (rep->r_flags & R_SOFTTERM))
724 return (EINTR);
725 } while (error == EWOULDBLOCK);
726 len -= auio.uio_resid;
728 if (error) {
729 m_freem(*mp);
730 *mp = (struct mbuf *)0;
733 * Search for any mbufs that are not a multiple of 4 bytes long
734 * or with m_data not longword aligned.
735 * These could cause pointer alignment problems, so copy them to
736 * well aligned mbufs.
738 nfs_realign(mp, 5 * NFSX_UNSIGNED);
739 return (error);
743 * Implement receipt of reply on a socket.
744 * We must search through the list of received datagrams matching them
745 * with outstanding requests using the xid, until ours is found.
747 /* ARGSUSED */
749 nfs_reply(struct nfsreq *myrep)
751 struct nfsreq *rep;
752 struct nfsmount *nmp = myrep->r_nmp;
753 int32_t t1;
754 struct mbuf *mrep, *md;
755 struct sockaddr *nam;
756 u_int32_t rxid, *tl;
757 caddr_t dpos, cp2;
758 int error;
761 * Loop around until we get our own reply
763 for (;;) {
765 * Lock against other receivers so that I don't get stuck in
766 * sbwait() after someone else has received my reply for me.
767 * Also necessary for connection based protocols to avoid
768 * race conditions during a reconnect.
769 * If nfs_rcvlock() returns EALREADY, that means that
770 * the reply has already been received by another
771 * process and we can return immediately. In this
772 * case, the lock is not taken to avoid races with
773 * other processes.
775 error = nfs_rcvlock(myrep);
776 if (error == EALREADY)
777 return (0);
778 if (error)
779 return (error);
781 * Get the next Rpc reply off the socket
783 error = nfs_receive(myrep, &nam, &mrep);
784 nfs_rcvunlock(myrep);
785 if (error) {
787 * Ignore routing errors on connectionless protocols??
789 if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) {
790 nmp->nm_so->so_error = 0;
791 if (myrep->r_flags & R_GETONEREP)
792 return (0);
793 continue;
795 return (error);
797 if (nam)
798 FREE(nam, M_SONAME);
801 * Get the xid and check that it is an rpc reply
803 md = mrep;
804 dpos = mtod(md, caddr_t);
805 nfsm_dissect(tl, u_int32_t *, 2*NFSX_UNSIGNED);
806 rxid = *tl++;
807 if (*tl != rpc_reply) {
808 nfsstats.rpcinvalid++;
809 m_freem(mrep);
810 nfsmout:
811 if (myrep->r_flags & R_GETONEREP)
812 return (0);
813 continue;
817 * Loop through the request list to match up the reply
818 * Iff no match, just drop the datagram. On match, set
819 * r_mrep atomically to prevent the timer from messing
820 * around with the request after we have exited the critical
821 * section.
823 crit_enter();
824 TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
825 if (rep->r_mrep == NULL && rxid == rep->r_xid) {
826 rep->r_mrep = mrep;
827 break;
830 crit_exit();
833 * Fill in the rest of the reply if we found a match.
835 if (rep) {
836 rep->r_md = md;
837 rep->r_dpos = dpos;
838 if (nfsrtton) {
839 struct rttl *rt;
841 rt = &nfsrtt.rttl[nfsrtt.pos];
842 rt->proc = rep->r_procnum;
843 rt->rto = NFS_RTO(nmp, proct[rep->r_procnum]);
844 rt->sent = nmp->nm_sent;
845 rt->cwnd = nmp->nm_cwnd;
846 rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1];
847 rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1];
848 rt->fsid = nmp->nm_mountp->mnt_stat.f_fsid;
849 getmicrotime(&rt->tstamp);
850 if (rep->r_flags & R_TIMING)
851 rt->rtt = rep->r_rtt;
852 else
853 rt->rtt = 1000000;
854 nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ;
857 * Update congestion window.
858 * Do the additive increase of
859 * one rpc/rtt.
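 * (For example, at the full window of 32 rpcs
 * each reply adds (256*256 + cwnd/2) / cwnd,
 * about 8 units, so one window's worth of
 * replies - roughly one rtt - grows nm_cwnd
 * by about one NFS_CWNDSCALE, i.e. one rpc.)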
861 if (nmp->nm_cwnd <= nmp->nm_sent) {
862 nmp->nm_cwnd +=
863 (NFS_CWNDSCALE * NFS_CWNDSCALE +
864 (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
865 if (nmp->nm_cwnd > NFS_MAXCWND)
866 nmp->nm_cwnd = NFS_MAXCWND;
868 crit_enter(); /* nfs_timer interlock for nm_sent */
869 if (rep->r_flags & R_SENT) {
870 rep->r_flags &= ~R_SENT;
871 nmp->nm_sent -= NFS_CWNDSCALE;
873 crit_exit();
875 * Update rtt using a gain of 0.125 on the mean
876 * and a gain of 0.25 on the deviation.
878 if (rep->r_flags & R_TIMING) {
880 * Since the timer resolution of
881 * NFS_HZ is so coarse, it can often
882 * result in r_rtt == 0. Since
883 * r_rtt == N means that the actual
884 * rtt is between N+dt and N+2-dt ticks,
885 * add 1.
887 t1 = rep->r_rtt + 1;
888 t1 -= (NFS_SRTT(rep) >> 3);
889 NFS_SRTT(rep) += t1;
890 if (t1 < 0)
891 t1 = -t1;
892 t1 -= (NFS_SDRTT(rep) >> 2);
893 NFS_SDRTT(rep) += t1;
895 nmp->nm_timeouts = 0;
898 * If not matched to a request, drop it.
899 * If it's mine, get out.
901 if (rep == NULL) {
902 nfsstats.rpcunexpected++;
903 m_freem(mrep);
904 } else if (rep == myrep) {
905 if (rep->r_mrep == NULL)
906 panic("nfsreply nil");
907 return (0);
909 if (myrep->r_flags & R_GETONEREP)
910 return (0);
915 * nfs_request - goes something like this
916 * - fill in request struct
917 * - links it into list
918 * - calls nfs_send() for first transmit
919 * - calls nfs_receive() to get reply
920 * - break down rpc header and return with nfs reply pointed to
921 * by mrep or error
922 * nb: always frees up mreq mbuf list
925 nfs_request(struct vnode *vp, struct mbuf *mrest, int procnum,
926 struct thread *td, struct ucred *cred, struct mbuf **mrp,
927 struct mbuf **mdp, caddr_t *dposp)
929 struct mbuf *mrep, *m2;
930 struct nfsreq *rep;
931 u_int32_t *tl;
932 int i;
933 struct nfsmount *nmp;
934 struct mbuf *m, *md, *mheadend;
935 char nickv[RPCX_NICKVERF];
936 time_t waituntil;
937 caddr_t dpos, cp2;
938 int t1, error = 0, mrest_len, auth_len, auth_type;
939 int trylater_delay = 15, trylater_cnt = 0, failed_auth = 0;
940 int verf_len, verf_type;
941 u_int32_t xid;
942 char *auth_str, *verf_str;
943 NFSKERBKEY_T key; /* save session key */
945 /* Reject requests while attempting a forced unmount. */
946 if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) {
947 m_freem(mrest);
948 return (ESTALE);
950 nmp = VFSTONFS(vp->v_mount);
951 MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
952 rep->r_nmp = nmp;
953 rep->r_vp = vp;
954 rep->r_td = td;
955 rep->r_procnum = procnum;
956 rep->r_mreq = NULL;
957 i = 0;
958 m = mrest;
959 while (m) {
960 i += m->m_len;
961 m = m->m_next;
963 mrest_len = i;
966 * Get the RPC header with authorization.
968 kerbauth:
969 verf_str = auth_str = (char *)0;
970 if (nmp->nm_flag & NFSMNT_KERB) {
971 verf_str = nickv;
972 verf_len = sizeof (nickv);
973 auth_type = RPCAUTH_KERB4;
974 bzero((caddr_t)key, sizeof (key));
975 if (failed_auth || nfs_getnickauth(nmp, cred, &auth_str,
976 &auth_len, verf_str, verf_len)) {
977 error = nfs_getauth(nmp, rep, cred, &auth_str,
978 &auth_len, verf_str, &verf_len, key);
979 if (error) {
980 kfree((caddr_t)rep, M_NFSREQ);
981 m_freem(mrest);
982 return (error);
985 } else {
986 auth_type = RPCAUTH_UNIX;
987 if (cred->cr_ngroups < 1)
988 panic("nfsreq nogrps");
989 auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
990 nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
991 5 * NFSX_UNSIGNED;
993 m = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
994 auth_str, verf_len, verf_str, mrest, mrest_len, &mheadend, &xid);
995 if (auth_str)
996 kfree(auth_str, M_TEMP);
999 * For stream protocols, insert a Sun RPC Record Mark.
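 * The mark is one 32-bit word: the top bit set marks the last fragment
 * and the low 31 bits carry the fragment length; the entire request
 * goes out as a single fragment here.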
1001 if (nmp->nm_sotype == SOCK_STREAM) {
1002 M_PREPEND(m, NFSX_UNSIGNED, MB_WAIT);
1003 if (m == NULL) {
1004 kfree(rep, M_NFSREQ);
1005 return (ENOBUFS);
1007 *mtod(m, u_int32_t *) = htonl(0x80000000 |
1008 (m->m_pkthdr.len - NFSX_UNSIGNED));
1010 rep->r_mreq = m;
1011 rep->r_xid = xid;
1012 tryagain:
1013 if (nmp->nm_flag & NFSMNT_SOFT)
1014 rep->r_retry = nmp->nm_retry;
1015 else
1016 rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */
1017 rep->r_rtt = rep->r_rexmit = 0;
1018 if (proct[procnum] > 0)
1019 rep->r_flags = R_TIMING | R_MASKTIMER;
1020 else
1021 rep->r_flags = R_MASKTIMER;
1022 rep->r_mrep = NULL;
1025 * Do the client side RPC.
1027 nfsstats.rpcrequests++;
1030 * Chain request into list of outstanding requests. Be sure
1031 * to put it LAST so timer finds oldest requests first. Note
1032 * that R_MASKTIMER is set at the moment to prevent any timer
1033 * action on this request while we are still doing processing on
1034 * it below. splsoftclock() primarily protects nm_sent. Note
1035 * that we may block in this code so there is no atomicity guarantee.
1037 crit_enter();
1038 TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);
1041 * If backing off another request or avoiding congestion, don't
1042 * send this one now but let timer do it. If not timing a request,
1043 * do it now.
1045 * Even though the timer will not mess with our request there is
1046 * still the possibility that we will race a reply (which clears
1047 * R_SENT), especially on localhost connections, so be very careful
1048 * when setting R_SENT. We could set R_SENT prior to calling
1049 * nfs_send() but why bother if the response occurs that quickly?
1051 if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
1052 (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1053 nmp->nm_sent < nmp->nm_cwnd)) {
1054 if (nmp->nm_soflags & PR_CONNREQUIRED)
1055 error = nfs_sndlock(rep);
1056 if (!error) {
1057 m2 = m_copym(m, 0, M_COPYALL, MB_WAIT);
1058 error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
1059 if (nmp->nm_soflags & PR_CONNREQUIRED)
1060 nfs_sndunlock(rep);
1062 if (!error && (rep->r_flags & R_MUSTRESEND) == 0 &&
1063 rep->r_mrep == NULL) {
1064 KASSERT((rep->r_flags & R_SENT) == 0,
1065 ("R_SENT ASSERT %p", rep));
1066 nmp->nm_sent += NFS_CWNDSCALE;
1067 rep->r_flags |= R_SENT;
1069 } else {
1070 rep->r_rtt = -1;
1074 * Let the timer do what it will with the request, then
1075 * wait for the reply from our send or the timer's.
1077 if (!error || error == EPIPE) {
1078 rep->r_flags &= ~R_MASKTIMER;
1079 crit_exit();
1080 error = nfs_reply(rep);
1081 crit_enter();
1085 * RPC done, unlink the request, but don't rip it out from under
1086 * the callout timer.
1088 while (rep->r_flags & R_LOCKED) {
1089 nfs_timer_raced = 1;
1090 tsleep(&nfs_timer_raced, 0, "nfstrac", 0);
1092 TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
1095 * Decrement the outstanding request count.
1097 if (rep->r_flags & R_SENT) {
1098 rep->r_flags &= ~R_SENT;
1099 nmp->nm_sent -= NFS_CWNDSCALE;
1101 crit_exit();
1104 * If there was a successful reply and a tprintf msg was printed,
1105 * tprintf a response saying the server is alive again.
1107 if (!error && (rep->r_flags & R_TPRINTFMSG))
1108 nfs_msg(rep->r_td, nmp->nm_mountp->mnt_stat.f_mntfromname,
1109 "is alive again");
1110 mrep = rep->r_mrep;
1111 md = rep->r_md;
1112 dpos = rep->r_dpos;
1113 if (error) {
1114 m_freem(rep->r_mreq);
1115 kfree((caddr_t)rep, M_NFSREQ);
1116 return (error);
1120 * break down the rpc header and check if ok
1122 nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
1123 if (*tl++ == rpc_msgdenied) {
1124 if (*tl == rpc_mismatch)
1125 error = EOPNOTSUPP;
1126 else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) {
1127 if (!failed_auth) {
1128 failed_auth++;
1129 mheadend->m_next = (struct mbuf *)0;
1130 m_freem(mrep);
1131 m_freem(rep->r_mreq);
1132 goto kerbauth;
1133 } else
1134 error = EAUTH;
1135 } else
1136 error = EACCES;
1137 m_freem(mrep);
1138 m_freem(rep->r_mreq);
1139 kfree((caddr_t)rep, M_NFSREQ);
1140 return (error);
1144 * Grab any Kerberos verifier, otherwise just throw it away.
1146 verf_type = fxdr_unsigned(int, *tl++);
1147 i = fxdr_unsigned(int32_t, *tl);
1148 if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) {
1149 error = nfs_savenickauth(nmp, cred, i, key, &md, &dpos, mrep);
1150 if (error)
1151 goto nfsmout;
1152 } else if (i > 0)
1153 nfsm_adv(nfsm_rndup(i));
1154 nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
1155 /* 0 == ok */
1156 if (*tl == 0) {
1157 nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
1158 if (*tl != 0) {
1159 error = fxdr_unsigned(int, *tl);
1160 if ((nmp->nm_flag & NFSMNT_NFSV3) &&
1161 error == NFSERR_TRYLATER) {
1162 m_freem(mrep);
1163 error = 0;
1164 waituntil = time_second + trylater_delay;
1165 while (time_second < waituntil)
1166 (void) tsleep((caddr_t)&lbolt,
1167 0, "nqnfstry", 0);
1168 trylater_delay *= nfs_backoff[trylater_cnt];
1169 if (trylater_cnt < 7)
1170 trylater_cnt++;
1171 goto tryagain;
1175 * If the File Handle was stale, invalidate the
1176 * lookup cache, just in case.
1178 * To avoid namecache<->vnode deadlocks we must
1179 * release the vnode lock if we hold it.
1181 if (error == ESTALE) {
1182 int ltype;
1184 ltype = lockstatus(&vp->v_lock, curthread);
1185 if (ltype == LK_EXCLUSIVE || ltype == LK_SHARED)
1186 lockmgr(&vp->v_lock, LK_RELEASE);
1187 cache_inval_vp(vp, CINV_CHILDREN);
1188 if (ltype == LK_EXCLUSIVE || ltype == LK_SHARED)
1189 lockmgr(&vp->v_lock, ltype);
1191 if (nmp->nm_flag & NFSMNT_NFSV3) {
1192 *mrp = mrep;
1193 *mdp = md;
1194 *dposp = dpos;
1195 error |= NFSERR_RETERR;
1196 } else
1197 m_freem(mrep);
1198 m_freem(rep->r_mreq);
1199 kfree((caddr_t)rep, M_NFSREQ);
1200 return (error);
1203 *mrp = mrep;
1204 *mdp = md;
1205 *dposp = dpos;
1206 m_freem(rep->r_mreq);
1207 FREE((caddr_t)rep, M_NFSREQ);
1208 return (0);
1210 m_freem(mrep);
1211 error = EPROTONOSUPPORT;
1212 nfsmout:
1213 m_freem(rep->r_mreq);
1214 kfree((caddr_t)rep, M_NFSREQ);
1215 return (error);
1218 #ifndef NFS_NOSERVER
1220 * Generate the rpc reply header
1221 * siz arg. is used to decide if adding a cluster is worthwhile
1224 nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp,
1225 int err, struct mbuf **mrq, struct mbuf **mbp, caddr_t *bposp)
1227 u_int32_t *tl;
1228 struct mbuf *mreq;
1229 caddr_t bpos;
1230 struct mbuf *mb, *mb2;
1232 siz += RPC_REPLYSIZ;
1233 mb = mreq = m_getl(max_hdr + siz, MB_WAIT, MT_DATA, M_PKTHDR, NULL);
1234 mreq->m_pkthdr.len = 0;
1236 * If this is not a cluster, try and leave leading space
1237 * for the lower level headers.
1239 if ((max_hdr + siz) < MINCLSIZE)
1240 mreq->m_data += max_hdr;
1241 tl = mtod(mreq, u_int32_t *);
1242 mreq->m_len = 6 * NFSX_UNSIGNED;
1243 bpos = ((caddr_t)tl) + mreq->m_len;
1244 *tl++ = txdr_unsigned(nd->nd_retxid);
1245 *tl++ = rpc_reply;
1246 if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
1247 *tl++ = rpc_msgdenied;
1248 if (err & NFSERR_AUTHERR) {
1249 *tl++ = rpc_autherr;
1250 *tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
1251 mreq->m_len -= NFSX_UNSIGNED;
1252 bpos -= NFSX_UNSIGNED;
1253 } else {
1254 *tl++ = rpc_mismatch;
1255 *tl++ = txdr_unsigned(RPC_VER2);
1256 *tl = txdr_unsigned(RPC_VER2);
1258 } else {
1259 *tl++ = rpc_msgaccepted;
1262 * For Kerberos authentication, we must send the nickname
1263 * verifier back, otherwise just RPCAUTH_NULL.
1265 if (nd->nd_flag & ND_KERBFULL) {
1266 struct nfsuid *nuidp;
1267 struct timeval ktvin, ktvout;
1269 for (nuidp = NUIDHASH(slp, nd->nd_cr.cr_uid)->lh_first;
1270 nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
1271 if (nuidp->nu_cr.cr_uid == nd->nd_cr.cr_uid &&
1272 (!nd->nd_nam2 || netaddr_match(NU_NETFAM(nuidp),
1273 &nuidp->nu_haddr, nd->nd_nam2)))
1274 break;
1276 if (nuidp) {
1277 ktvin.tv_sec =
1278 txdr_unsigned(nuidp->nu_timestamp.tv_sec - 1);
1279 ktvin.tv_usec =
1280 txdr_unsigned(nuidp->nu_timestamp.tv_usec);
1283 * Encrypt the timestamp in ecb mode using the
1284 * session key.
1286 #ifdef NFSKERB
1288 #endif
1290 *tl++ = rpc_auth_kerb;
1291 *tl++ = txdr_unsigned(3 * NFSX_UNSIGNED);
1292 *tl = ktvout.tv_sec;
1293 nfsm_build(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
1294 *tl++ = ktvout.tv_usec;
1295 *tl++ = txdr_unsigned(nuidp->nu_cr.cr_uid);
1296 } else {
1297 *tl++ = 0;
1298 *tl++ = 0;
1300 } else {
1301 *tl++ = 0;
1302 *tl++ = 0;
1304 switch (err) {
1305 case EPROGUNAVAIL:
1306 *tl = txdr_unsigned(RPC_PROGUNAVAIL);
1307 break;
1308 case EPROGMISMATCH:
1309 *tl = txdr_unsigned(RPC_PROGMISMATCH);
1310 nfsm_build(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
1311 *tl++ = txdr_unsigned(2);
1312 *tl = txdr_unsigned(3);
1313 break;
1314 case EPROCUNAVAIL:
1315 *tl = txdr_unsigned(RPC_PROCUNAVAIL);
1316 break;
1317 case EBADRPC:
1318 *tl = txdr_unsigned(RPC_GARBAGE);
1319 break;
1320 default:
1321 *tl = 0;
1322 if (err != NFSERR_RETVOID) {
1323 nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED);
1324 if (err)
1325 *tl = txdr_unsigned(nfsrv_errmap(nd, err));
1326 else
1327 *tl = 0;
1329 break;
1333 if (mrq != NULL)
1334 *mrq = mreq;
1335 *mbp = mb;
1336 *bposp = bpos;
1337 if (err != 0 && err != NFSERR_RETVOID)
1338 nfsstats.srvrpc_errs++;
1339 return (0);
1343 #endif /* NFS_NOSERVER */
1345 * Nfs timer routine
1346 * Scan the nfsreq list and retransmit any requests that have timed out.
1347 * To avoid retransmission attempts on STREAM sockets (in the future) make
1348 * sure to set the r_retry field to 0 (implies nm_retry == 0).
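 *
 * Runs from a callout every nfs_ticks.  Requests that already have a
 * reply or carry R_SOFTTERM/R_MASKTIMER are skipped; R_LOCKED pins a
 * request while we may block in so_pru_send() so that nfs_request()
 * cannot rip it out from under us.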
1350 void
1351 nfs_timer(void *arg /* never used */)
1353 struct nfsreq *rep;
1354 struct mbuf *m;
1355 struct socket *so;
1356 struct nfsmount *nmp;
1357 int timeo;
1358 int error;
1359 #ifndef NFS_NOSERVER
1360 struct nfssvc_sock *slp;
1361 u_quad_t cur_usec;
1362 #endif /* NFS_NOSERVER */
1363 struct thread *td = &thread0; /* XXX for credentials, will break if sleep */
1365 crit_enter();
1366 TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
1367 nmp = rep->r_nmp;
1368 if (rep->r_mrep || (rep->r_flags & (R_SOFTTERM|R_MASKTIMER)))
1369 continue;
1370 rep->r_flags |= R_LOCKED;
1371 if (nfs_sigintr(nmp, rep, rep->r_td)) {
1372 nfs_softterm(rep);
1373 goto skip;
1375 if (rep->r_rtt >= 0) {
1376 rep->r_rtt++;
1377 if (nmp->nm_flag & NFSMNT_DUMBTIMR)
1378 timeo = nmp->nm_timeo;
1379 else
1380 timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
1381 if (nmp->nm_timeouts > 0)
1382 timeo *= nfs_backoff[nmp->nm_timeouts - 1];
1383 if (rep->r_rtt <= timeo)
1384 goto skip;
1385 if (nmp->nm_timeouts < 8)
1386 nmp->nm_timeouts++;
1389 * Check for server not responding
1391 if ((rep->r_flags & R_TPRINTFMSG) == 0 &&
1392 rep->r_rexmit > nmp->nm_deadthresh) {
1393 nfs_msg(rep->r_td,
1394 nmp->nm_mountp->mnt_stat.f_mntfromname,
1395 "not responding");
1396 rep->r_flags |= R_TPRINTFMSG;
1398 if (rep->r_rexmit >= rep->r_retry) { /* too many */
1399 nfsstats.rpctimeouts++;
1400 nfs_softterm(rep);
1401 goto skip;
1403 if (nmp->nm_sotype != SOCK_DGRAM) {
1404 if (++rep->r_rexmit > NFS_MAXREXMIT)
1405 rep->r_rexmit = NFS_MAXREXMIT;
1406 goto skip;
1408 if ((so = nmp->nm_so) == NULL)
1409 goto skip;
1412 * If there is enough space and the window allows,
1413 * resend it.
1414 * Set r_rtt to -1 in case we fail to send it now.
1416 rep->r_rtt = -1;
1417 if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
1418 ((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1419 (rep->r_flags & R_SENT) ||
1420 nmp->nm_sent < nmp->nm_cwnd) &&
1421 (m = m_copym(rep->r_mreq, 0, M_COPYALL, MB_DONTWAIT))){
1422 if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
1423 error = so_pru_send(so, 0, m, (struct sockaddr *)0,
1424 (struct mbuf *)0, td);
1425 else
1426 error = so_pru_send(so, 0, m, nmp->nm_nam,
1427 (struct mbuf *)0, td);
1428 if (error) {
1429 if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
1430 so->so_error = 0;
1431 } else if (rep->r_mrep == NULL) {
1433 * Iff first send, start timing
1434 * else turn timing off, backoff timer
1435 * and divide congestion window by 2.
1437 * It is possible for the so_pru_send() to
1438 * block and for us to race a reply so we
1439 * only do this if the reply field has not
1440 * been filled in. R_LOCKED will prevent
1441 * the request from being ripped out from under
1442 * us entirely.
1444 if (rep->r_flags & R_SENT) {
1445 rep->r_flags &= ~R_TIMING;
1446 if (++rep->r_rexmit > NFS_MAXREXMIT)
1447 rep->r_rexmit = NFS_MAXREXMIT;
1448 nmp->nm_cwnd >>= 1;
1449 if (nmp->nm_cwnd < NFS_CWNDSCALE)
1450 nmp->nm_cwnd = NFS_CWNDSCALE;
1451 nfsstats.rpcretries++;
1452 } else {
1453 rep->r_flags |= R_SENT;
1454 nmp->nm_sent += NFS_CWNDSCALE;
1456 rep->r_rtt = 0;
1459 skip:
1460 rep->r_flags &= ~R_LOCKED;
1462 #ifndef NFS_NOSERVER
1465 * Scan the write gathering queues for writes that need to be
1466 * completed now.
1468 cur_usec = nfs_curusec();
1469 TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) {
1470 if (slp->ns_tq.lh_first && slp->ns_tq.lh_first->nd_time<=cur_usec)
1471 nfsrv_wakenfsd(slp, 1);
1473 #endif /* NFS_NOSERVER */
1476 * Due to possible blocking, a client operation may be waiting for
1477 * us to finish processing this request so it can remove it.
1479 if (nfs_timer_raced) {
1480 nfs_timer_raced = 0;
1481 wakeup(&nfs_timer_raced);
1483 crit_exit();
1484 callout_reset(&nfs_timer_handle, nfs_ticks, nfs_timer, NULL);
1488 * Mark all of an nfs mount's outstanding requests with R_SOFTTERM and
1489 * wait for all requests to complete. This is used by forced unmounts
1490 * to terminate any outstanding RPCs.
1493 nfs_nmcancelreqs(struct nfsmount *nmp)
1495 struct nfsreq *req;
1496 int i;
1498 crit_enter();
1499 TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
1500 if (nmp != req->r_nmp || req->r_mrep != NULL ||
1501 (req->r_flags & R_SOFTTERM)) {
1502 continue;
1504 nfs_softterm(req);
1506 crit_exit();
1508 for (i = 0; i < 30; i++) {
1509 crit_enter();
1510 TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
1511 if (nmp == req->r_nmp)
1512 break;
1514 crit_exit();
1515 if (req == NULL)
1516 return (0);
1517 tsleep(&lbolt, 0, "nfscancel", 0);
1519 return (EBUSY);
1523 * Flag a request as being about to terminate (due to NFSMNT_INT/NFSMNT_SOFT).
1524 * The nm_sent count is decremented now to avoid deadlocks when the process in
1525 * soreceive() hasn't yet managed to send its own request.
1527 * This routine must be called at splsoftclock() to protect r_flags and
1528 * nm_sent.
1531 static void
1532 nfs_softterm(struct nfsreq *rep)
1534 rep->r_flags |= R_SOFTTERM;
1536 if (rep->r_flags & R_SENT) {
1537 rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
1538 rep->r_flags &= ~R_SENT;
1543 * Test for a termination condition pending on the process.
1544 * This is used for NFSMNT_INT mounts.
1547 nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct thread *td)
1549 sigset_t tmpset;
1550 struct proc *p;
1551 struct lwp *lp;
1553 if (rep && (rep->r_flags & R_SOFTTERM))
1554 return (EINTR);
1555 /* Terminate all requests while attempting a forced unmount. */
1556 if (nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF)
1557 return (EINTR);
1558 if (!(nmp->nm_flag & NFSMNT_INT))
1559 return (0);
1560 /* td might be NULL YYY */
1561 if (td == NULL || (p = td->td_proc) == NULL)
1562 return (0);
1564 lp = td->td_lwp;
1565 tmpset = lwp_sigpend(lp);
1566 SIGSETNAND(tmpset, lp->lwp_sigmask);
1567 SIGSETNAND(tmpset, p->p_sigignore);
1568 if (SIGNOTEMPTY(tmpset) && NFSINT_SIGMASK(tmpset))
1569 return (EINTR);
1571 return (0);
1575 * Lock a socket against others.
1576 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
1577 * and also to avoid race conditions between the processes with nfs requests
1578 * in progress when a reconnect is necessary.
1581 nfs_sndlock(struct nfsreq *rep)
1583 int *statep = &rep->r_nmp->nm_state;
1584 struct thread *td;
1585 int slptimeo;
1586 int slpflag;
1587 int error;
1589 slpflag = 0;
1590 slptimeo = 0;
1591 td = rep->r_td;
1592 if (rep->r_nmp->nm_flag & NFSMNT_INT)
1593 slpflag = PCATCH;
1595 error = 0;
1596 crit_enter();
1597 while (*statep & NFSSTA_SNDLOCK) {
1598 *statep |= NFSSTA_WANTSND;
1599 if (nfs_sigintr(rep->r_nmp, rep, td)) {
1600 error = EINTR;
1601 break;
1603 tsleep((caddr_t)statep, slpflag, "nfsndlck", slptimeo);
1604 if (slpflag == PCATCH) {
1605 slpflag = 0;
1606 slptimeo = 2 * hz;
1609 /* Always fail if our request has been cancelled. */
1610 if ((rep->r_flags & R_SOFTTERM))
1611 error = EINTR;
1612 if (error == 0)
1613 *statep |= NFSSTA_SNDLOCK;
1614 crit_exit();
1615 return (error);
1619 * Unlock the stream socket for others.
1621 void
1622 nfs_sndunlock(struct nfsreq *rep)
1624 int *statep = &rep->r_nmp->nm_state;
1626 if ((*statep & NFSSTA_SNDLOCK) == 0)
1627 panic("nfs sndunlock");
1628 crit_enter();
1629 *statep &= ~NFSSTA_SNDLOCK;
1630 if (*statep & NFSSTA_WANTSND) {
1631 *statep &= ~NFSSTA_WANTSND;
1632 wakeup((caddr_t)statep);
1634 crit_exit();
1637 static int
1638 nfs_rcvlock(struct nfsreq *rep)
1640 int *statep = &rep->r_nmp->nm_state;
1641 int slpflag;
1642 int slptimeo;
1643 int error;
1646 * Unconditionally check for completion in case another nfsiod
1647 * gets the packet while the caller was blocked, before the caller
1648 * called us. Packet reception is handled by mainline code which
1649 * is protected by the BGL at the moment.
1651 * We do not strictly need the second check just before the
1652 * tsleep(), but it's good defensive programming.
1654 if (rep->r_mrep != NULL)
1655 return (EALREADY);
1657 if (rep->r_nmp->nm_flag & NFSMNT_INT)
1658 slpflag = PCATCH;
1659 else
1660 slpflag = 0;
1661 slptimeo = 0;
1662 error = 0;
1663 crit_enter();
1664 while (*statep & NFSSTA_RCVLOCK) {
1665 if (nfs_sigintr(rep->r_nmp, rep, rep->r_td)) {
1666 error = EINTR;
1667 break;
1669 if (rep->r_mrep != NULL) {
1670 error = EALREADY;
1671 break;
1673 *statep |= NFSSTA_WANTRCV;
1674 tsleep((caddr_t)statep, slpflag, "nfsrcvlk", slptimeo);
1676 * If our reply was received while we were sleeping,
1677 * then just return without taking the lock to avoid a
1678 * situation where a single iod could 'capture' the
1679 * receive lock.
1681 if (rep->r_mrep != NULL) {
1682 error = EALREADY;
1683 break;
1685 if (slpflag == PCATCH) {
1686 slpflag = 0;
1687 slptimeo = 2 * hz;
1690 if (error == 0) {
1691 *statep |= NFSSTA_RCVLOCK;
1692 rep->r_nmp->nm_rcvlock_td = curthread; /* DEBUGGING */
1694 crit_exit();
1695 return (error);
1699 * Unlock the stream socket for others.
1701 static void
1702 nfs_rcvunlock(struct nfsreq *rep)
1704 int *statep = &rep->r_nmp->nm_state;
1706 if ((*statep & NFSSTA_RCVLOCK) == 0)
1707 panic("nfs rcvunlock");
1708 crit_enter();
1709 rep->r_nmp->nm_rcvlock_td = (void *)-1; /* DEBUGGING */
1710 *statep &= ~NFSSTA_RCVLOCK;
1711 if (*statep & NFSSTA_WANTRCV) {
1712 *statep &= ~NFSSTA_WANTRCV;
1713 wakeup((caddr_t)statep);
1715 crit_exit();
1719 * nfs_realign:
1721 * Check for badly aligned mbuf data and realign by copying the unaligned
1722 * portion of the data into a new mbuf chain and freeing the portions
1723 * of the old chain that were replaced.
1725 * We cannot simply realign the data within the existing mbuf chain
1726 * because the underlying buffers may contain other rpc commands and
1727 * we cannot afford to overwrite them.
1729 * We would prefer to avoid this situation entirely. The situation does
1730 * not occur with NFS/UDP and is supposed to only occasionally occur
1731 * with TCP. Use vfs.nfs.realign_count and realign_test to check this.
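 *
 * nfs_realign_test counts every pass through this routine while
 * nfs_realign_count counts only the passes that actually had to copy,
 * so a rising count/test ratio points at a peer or driver handing us
 * misaligned mbufs.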
1733 static void
1734 nfs_realign(struct mbuf **pm, int hsiz)
1736 struct mbuf *m;
1737 struct mbuf *n = NULL;
1738 int off = 0;
1740 ++nfs_realign_test;
1742 while ((m = *pm) != NULL) {
1743 if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
1744 n = m_getl(m->m_len, MB_WAIT, MT_DATA, 0, NULL);
1745 n->m_len = 0;
1746 break;
1748 pm = &m->m_next;
1752 * If n is non-NULL, loop on m copying data, then replace the
1753 * portion of the chain that had to be realigned.
1755 if (n != NULL) {
1756 ++nfs_realign_count;
1757 while (m) {
1758 m_copyback(n, off, m->m_len, mtod(m, caddr_t));
1759 off += m->m_len;
1760 m = m->m_next;
1762 m_freem(*pm);
1763 *pm = n;
1767 #ifndef NFS_NOSERVER
1770 * Parse an RPC request
1771 * - verify it
1772 * - fill in the cred struct.
1775 nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
1777 int len, i;
1778 u_int32_t *tl;
1779 int32_t t1;
1780 struct uio uio;
1781 struct iovec iov;
1782 caddr_t dpos, cp2, cp;
1783 u_int32_t nfsvers, auth_type;
1784 uid_t nickuid;
1785 int error = 0, ticklen;
1786 struct mbuf *mrep, *md;
1787 struct nfsuid *nuidp;
1788 struct timeval tvin, tvout;
1789 #if 0 /* until encrypted keys are implemented */
1790 NFSKERBKEYSCHED_T keys; /* stores key schedule */
1791 #endif
1793 mrep = nd->nd_mrep;
1794 md = nd->nd_md;
1795 dpos = nd->nd_dpos;
1796 if (has_header) {
1797 nfsm_dissect(tl, u_int32_t *, 10 * NFSX_UNSIGNED);
1798 nd->nd_retxid = fxdr_unsigned(u_int32_t, *tl++);
1799 if (*tl++ != rpc_call) {
1800 m_freem(mrep);
1801 return (EBADRPC);
1803 } else
1804 nfsm_dissect(tl, u_int32_t *, 8 * NFSX_UNSIGNED);
1805 nd->nd_repstat = 0;
1806 nd->nd_flag = 0;
1807 if (*tl++ != rpc_vers) {
1808 nd->nd_repstat = ERPCMISMATCH;
1809 nd->nd_procnum = NFSPROC_NOOP;
1810 return (0);
1812 if (*tl != nfs_prog) {
1813 nd->nd_repstat = EPROGUNAVAIL;
1814 nd->nd_procnum = NFSPROC_NOOP;
1815 return (0);
1817 tl++;
1818 nfsvers = fxdr_unsigned(u_int32_t, *tl++);
1819 if (nfsvers < NFS_VER2 || nfsvers > NFS_VER3) {
1820 nd->nd_repstat = EPROGMISMATCH;
1821 nd->nd_procnum = NFSPROC_NOOP;
1822 return (0);
1824 if (nfsvers == NFS_VER3)
1825 nd->nd_flag = ND_NFSV3;
1826 nd->nd_procnum = fxdr_unsigned(u_int32_t, *tl++);
1827 if (nd->nd_procnum == NFSPROC_NULL)
1828 return (0);
1829 if (nd->nd_procnum >= NFS_NPROCS ||
1830 (nd->nd_procnum >= NQNFSPROC_GETLEASE) ||
1831 (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
1832 nd->nd_repstat = EPROCUNAVAIL;
1833 nd->nd_procnum = NFSPROC_NOOP;
1834 return (0);
1836 if ((nd->nd_flag & ND_NFSV3) == 0)
1837 nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
1838 auth_type = *tl++;
1839 len = fxdr_unsigned(int, *tl++);
1840 if (len < 0 || len > RPCAUTH_MAXSIZ) {
1841 m_freem(mrep);
1842 return (EBADRPC);
1845 nd->nd_flag &= ~ND_KERBAUTH;
1847 * Handle auth_unix or auth_kerb.
1849 if (auth_type == rpc_auth_unix) {
1850 len = fxdr_unsigned(int, *++tl);
1851 if (len < 0 || len > NFS_MAXNAMLEN) {
1852 m_freem(mrep);
1853 return (EBADRPC);
1855 nfsm_adv(nfsm_rndup(len));
1856 nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
1857 bzero((caddr_t)&nd->nd_cr, sizeof (struct ucred));
1858 nd->nd_cr.cr_ref = 1;
1859 nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
1860 nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++);
1861 len = fxdr_unsigned(int, *tl);
1862 if (len < 0 || len > RPCAUTH_UNIXGIDS) {
1863 m_freem(mrep);
1864 return (EBADRPC);
1866 nfsm_dissect(tl, u_int32_t *, (len + 2) * NFSX_UNSIGNED);
1867 for (i = 1; i <= len; i++)
1868 if (i < NGROUPS)
1869 nd->nd_cr.cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
1870 else
1871 tl++;
1872 nd->nd_cr.cr_ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
1873 if (nd->nd_cr.cr_ngroups > 1)
1874 nfsrvw_sort(nd->nd_cr.cr_groups, nd->nd_cr.cr_ngroups);
1875 len = fxdr_unsigned(int, *++tl);
1876 if (len < 0 || len > RPCAUTH_MAXSIZ) {
1877 m_freem(mrep);
1878 return (EBADRPC);
1880 if (len > 0)
1881 nfsm_adv(nfsm_rndup(len));
1882 } else if (auth_type == rpc_auth_kerb) {
1883 switch (fxdr_unsigned(int, *tl++)) {
1884 case RPCAKN_FULLNAME:
1885 ticklen = fxdr_unsigned(int, *tl);
1886 *((u_int32_t *)nfsd->nfsd_authstr) = *tl;
1887 uio.uio_resid = nfsm_rndup(ticklen) + NFSX_UNSIGNED;
1888 nfsd->nfsd_authlen = uio.uio_resid + NFSX_UNSIGNED;
1889 if (uio.uio_resid > (len - 2 * NFSX_UNSIGNED)) {
1890 m_freem(mrep);
1891 return (EBADRPC);
1893 uio.uio_offset = 0;
1894 uio.uio_iov = &iov;
1895 uio.uio_iovcnt = 1;
1896 uio.uio_segflg = UIO_SYSSPACE;
1897 iov.iov_base = (caddr_t)&nfsd->nfsd_authstr[4];
1898 iov.iov_len = RPCAUTH_MAXSIZ - 4;
1899 nfsm_mtouio(&uio, uio.uio_resid);
1900 nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
1901 if (*tl++ != rpc_auth_kerb ||
1902 fxdr_unsigned(int, *tl) != 4 * NFSX_UNSIGNED) {
1903 kprintf("Bad kerb verifier\n");
1904 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
1905 nd->nd_procnum = NFSPROC_NOOP;
1906 return (0);
1908 nfsm_dissect(cp, caddr_t, 4 * NFSX_UNSIGNED);
1909 tl = (u_int32_t *)cp;
1910 if (fxdr_unsigned(int, *tl) != RPCAKN_FULLNAME) {
1911 kprintf("Not fullname kerb verifier\n");
1912 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
1913 nd->nd_procnum = NFSPROC_NOOP;
1914 return (0);
1916 cp += NFSX_UNSIGNED;
1917 bcopy(cp, nfsd->nfsd_verfstr, 3 * NFSX_UNSIGNED);
1918 nfsd->nfsd_verflen = 3 * NFSX_UNSIGNED;
1919 nd->nd_flag |= ND_KERBFULL;
1920 nfsd->nfsd_flag |= NFSD_NEEDAUTH;
1921 break;
1922 case RPCAKN_NICKNAME:
1923 if (len != 2 * NFSX_UNSIGNED) {
1924 kprintf("Kerb nickname short\n");
1925 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADCRED);
1926 nd->nd_procnum = NFSPROC_NOOP;
1927 return (0);
1929 nickuid = fxdr_unsigned(uid_t, *tl);
1930 nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
1931 if (*tl++ != rpc_auth_kerb ||
1932 fxdr_unsigned(int, *tl) != 3 * NFSX_UNSIGNED) {
1933 kprintf("Kerb nick verifier bad\n");
1934 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
1935 nd->nd_procnum = NFSPROC_NOOP;
1936 return (0);
1938 nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
1939 tvin.tv_sec = *tl++;
1940 tvin.tv_usec = *tl;
1942 for (nuidp = NUIDHASH(nfsd->nfsd_slp,nickuid)->lh_first;
1943 nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
1944 if (nuidp->nu_cr.cr_uid == nickuid &&
1945 (!nd->nd_nam2 ||
1946 netaddr_match(NU_NETFAM(nuidp),
1947 &nuidp->nu_haddr, nd->nd_nam2)))
1948 break;
1950 if (!nuidp) {
1951 nd->nd_repstat =
1952 (NFSERR_AUTHERR|AUTH_REJECTCRED);
1953 nd->nd_procnum = NFSPROC_NOOP;
1954 return (0);
1958 * Now, decrypt the timestamp using the session key
1959 * and validate it.
1961 #ifdef NFSKERB
1963 #endif
1965 tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec);
1966 tvout.tv_usec = fxdr_unsigned(long, tvout.tv_usec);
1967 if (nuidp->nu_expire < time_second ||
1968 nuidp->nu_timestamp.tv_sec > tvout.tv_sec ||
1969 (nuidp->nu_timestamp.tv_sec == tvout.tv_sec &&
1970 nuidp->nu_timestamp.tv_usec > tvout.tv_usec)) {
1971 nuidp->nu_expire = 0;
1972 nd->nd_repstat =
1973 (NFSERR_AUTHERR|AUTH_REJECTVERF);
1974 nd->nd_procnum = NFSPROC_NOOP;
1975 return (0);
1977 nfsrv_setcred(&nuidp->nu_cr, &nd->nd_cr);
1978 nd->nd_flag |= ND_KERBNICK;
1980 } else {
1981 nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
1982 nd->nd_procnum = NFSPROC_NOOP;
1983 return (0);
1986 nd->nd_md = md;
1987 nd->nd_dpos = dpos;
1988 return (0);
1989 nfsmout:
1990 return (error);
1993 #endif
1996 * Send a message to the originating process's terminal. The thread and/or
1997 * process may be NULL. YYY the thread should not be NULL, but there may
1998 * still be some uio_td's being passed as NULL through to
1999 * nfsm_request().
2001 static int
2002 nfs_msg(struct thread *td, char *server, char *msg)
2004 tpr_t tpr;
2006 if (td && td->td_proc)
2007 tpr = tprintf_open(td->td_proc);
2008 else
2009 tpr = NULL;
2010 tprintf(tpr, "nfs server %s: %s\n", server, msg);
2011 tprintf_close(tpr);
2012 return (0);
2015 #ifndef NFS_NOSERVER
2017 * Socket upcall routine for the nfsd sockets.
2018 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
2019 * Essentially do as much as possible non-blocking, else punt and it will
2020 * be called with MB_WAIT from an nfsd.
2022 void
2023 nfsrv_rcv(struct socket *so, void *arg, int waitflag)
2025 struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
2026 struct mbuf *m;
2027 struct mbuf *mp;
2028 struct sockaddr *nam;
2029 struct uio auio;
2030 int flags, error;
2031 int nparallel_wakeup = 0;
2033 if ((slp->ns_flag & SLP_VALID) == 0)
2034 return;
2037 * Do not allow an infinite number of completed RPC records to build
2038 * up before we stop reading data from the socket. Otherwise we could
2039 * end up holding onto an unreasonable number of mbufs for requests
2040 * waiting for service.
2042 * This should give pretty good feedback to the TCP
2043 * layer and prevents a memory crunch for other protocols.
2045 * Note that the same service socket can be dispatched to several
2046 * nfs servers simultaneously.
2048 * the tcp protocol callback calls us with MB_DONTWAIT.
2049 * nfsd calls us with MB_WAIT (typically).
2051 if (waitflag == MB_DONTWAIT && slp->ns_numrec >= nfsd_waiting / 2 + 1) {
2052 slp->ns_flag |= SLP_NEEDQ;
2053 goto dorecs;

	/*
	 * Handle protocol specifics to parse an RPC request.  We always
	 * pull from the socket using non-blocking I/O.
	 */
	auio.uio_td = NULL;
	if (so->so_type == SOCK_STREAM) {
		/*
		 * The data has to be read in an orderly fashion from a TCP
		 * stream, unlike a UDP socket.  It is possible for soreceive
		 * and/or nfsrv_getstream() to block, so make sure only one
		 * entity is messing around with the TCP stream at any given
		 * moment.  The receive sockbuf's lock in soreceive is not
		 * sufficient.
		 *
		 * Note that this procedure can be called from any number of
		 * NFS servers *OR* can be upcalled directly from a TCP
		 * protocol thread.
		 */
		if (slp->ns_flag & SLP_GETSTREAM) {
			slp->ns_flag |= SLP_NEEDQ;
			goto dorecs;
		}
		slp->ns_flag |= SLP_GETSTREAM;

		/*
		 * Do soreceive().
		 */
		auio.uio_resid = 1000000000;
		flags = MSG_DONTWAIT;
		error = so_pru_soreceive(so, &nam, &auio, &mp, NULL, &flags);
		if (error || mp == NULL) {
			if (error == EWOULDBLOCK)
				slp->ns_flag |= SLP_NEEDQ;
			else
				slp->ns_flag |= SLP_DISCONN;
			slp->ns_flag &= ~SLP_GETSTREAM;
			goto dorecs;
		}
		m = mp;
		if (slp->ns_rawend) {
			slp->ns_rawend->m_next = m;
			slp->ns_cc += 1000000000 - auio.uio_resid;
		} else {
			slp->ns_raw = m;
			slp->ns_cc = 1000000000 - auio.uio_resid;
		}
		while (m->m_next)
			m = m->m_next;
		slp->ns_rawend = m;
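		/*
		 * Byte accounting: the bytes just read are appended to the
		 * ns_raw mbuf chain and ns_cc is advanced by the amount
		 * actually received (the huge uio_resid request minus what
		 * was left over), so ns_cc always equals the number of
		 * unparsed stream bytes queued on ns_raw.
		 */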

		/*
		 * Now try and parse as many record(s) as we can out of the
		 * raw stream data.
		 */
		error = nfsrv_getstream(slp, waitflag, &nparallel_wakeup);
		if (error) {
			if (error == EPERM)
				slp->ns_flag |= SLP_DISCONN;
			else
				slp->ns_flag |= SLP_NEEDQ;
		}
		slp->ns_flag &= ~SLP_GETSTREAM;
	} else {
		/*
		 * For UDP, soreceive typically pulls just one packet, so
		 * loop to get the whole batch.
		 */
		do {
			auio.uio_resid = 1000000000;
			flags = MSG_DONTWAIT;
			error = so_pru_soreceive(so, &nam, &auio, &mp, NULL,
						 &flags);
			if (mp) {
				struct nfsrv_rec *rec;
				int mf = (waitflag & MB_DONTWAIT) ?
					 M_NOWAIT : M_WAITOK;
				rec = kmalloc(sizeof(struct nfsrv_rec),
					      M_NFSRVDESC, mf);
				if (!rec) {
					if (nam)
						FREE(nam, M_SONAME);
					m_freem(mp);
					continue;
				}
				nfs_realign(&mp, 10 * NFSX_UNSIGNED);
				rec->nr_address = nam;
				rec->nr_packet = mp;
				STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
				++slp->ns_numrec;
				++nparallel_wakeup;
			}
			if (error) {
				if ((so->so_proto->pr_flags & PR_CONNREQUIRED)
				    && error != EWOULDBLOCK) {
					slp->ns_flag |= SLP_DISCONN;
					goto dorecs;
				}
			}
		} while (mp);
	}

	/*
	 * If we were upcalled from the tcp protocol layer and we have
	 * fully parsed records ready to go, or there is new data pending,
	 * or something went wrong, try to wake up an nfsd thread to deal
	 * with it.
	 */
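	/*
	 * When called with MB_WAIT the caller is itself an nfsd, which will
	 * consume the queued records directly, so no wakeup is issued in
	 * that case.
	 */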
dorecs:
	if (waitflag == MB_DONTWAIT &&
	    (slp->ns_numrec > 0 ||
	     (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN)))) {
		nfsrv_wakenfsd(slp, nparallel_wakeup);
	}
}

/*
 * Try and extract an RPC request from the mbuf data list received on a
 * stream socket.  The "waitflag" argument indicates whether or not it
 * can sleep.
 */
static int
nfsrv_getstream(struct nfssvc_sock *slp, int waitflag, int *countp)
{
	struct mbuf *m, **mpp;
	char *cp1, *cp2;
	int len;
	struct mbuf *om, *m2, *recm;
	u_int32_t recmark;

	for (;;) {
		if (slp->ns_reclen == 0) {
			if (slp->ns_cc < NFSX_UNSIGNED)
				return (0);
			m = slp->ns_raw;
			if (m->m_len >= NFSX_UNSIGNED) {
				bcopy(mtod(m, caddr_t), (caddr_t)&recmark,
				      NFSX_UNSIGNED);
				m->m_data += NFSX_UNSIGNED;
				m->m_len -= NFSX_UNSIGNED;
			} else {
				cp1 = (caddr_t)&recmark;
				cp2 = mtod(m, caddr_t);
				while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
					while (m->m_len == 0) {
						m = m->m_next;
						cp2 = mtod(m, caddr_t);
					}
					*cp1++ = *cp2++;
					m->m_data++;
					m->m_len--;
				}
			}
			slp->ns_cc -= NFSX_UNSIGNED;
			recmark = ntohl(recmark);
			slp->ns_reclen = recmark & ~0x80000000;
			if (recmark & 0x80000000)
				slp->ns_flag |= SLP_LASTFRAG;
			else
				slp->ns_flag &= ~SLP_LASTFRAG;
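			/*
			 * Worked example of RFC 1831 record marking: a
			 * marker of 0x800000a8 has the high bit set (this is
			 * the last fragment of the record) and a fragment
			 * length of 0xa8 = 168 bytes, while 0x000000a8 would
			 * be a 168-byte fragment with more fragments to
			 * follow.
			 */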
			if (slp->ns_reclen > NFS_MAXPACKET ||
			    slp->ns_reclen <= 0) {
				log(LOG_ERR, "%s (%d) from nfs client\n",
				    "impossible packet length",
				    slp->ns_reclen);
				return (EPERM);
			}
		}

		/*
		 * Now get the record part.
		 *
		 * Note that slp->ns_reclen may be 0.  Linux sometimes
		 * generates 0-length RPCs.
		 */
		recm = NULL;
		if (slp->ns_cc == slp->ns_reclen) {
			recm = slp->ns_raw;
			slp->ns_raw = slp->ns_rawend = NULL;
			slp->ns_cc = slp->ns_reclen = 0;
		} else if (slp->ns_cc > slp->ns_reclen) {
			len = 0;
			m = slp->ns_raw;
			om = NULL;

			while (len < slp->ns_reclen) {
				if ((len + m->m_len) > slp->ns_reclen) {
					m2 = m_copym(m, 0, slp->ns_reclen - len,
						     waitflag);
					if (m2) {
						if (om) {
							om->m_next = m2;
							recm = slp->ns_raw;
						} else
							recm = m2;
						m->m_data += slp->ns_reclen - len;
						m->m_len -= slp->ns_reclen - len;
						len = slp->ns_reclen;
					} else {
						return (EWOULDBLOCK);
					}
				} else if ((len + m->m_len) == slp->ns_reclen) {
					om = m;
					len += m->m_len;
					m = m->m_next;
					recm = slp->ns_raw;
					om->m_next = NULL;
				} else {
					om = m;
					len += m->m_len;
					m = m->m_next;
				}
			}
			slp->ns_raw = m;
			slp->ns_cc -= len;
			slp->ns_reclen = 0;
		} else {
			return (0);
		}

		/*
		 * Accumulate the fragments into a record.
		 */
		mpp = &slp->ns_frag;
		while (*mpp)
			mpp = &((*mpp)->m_next);
		*mpp = recm;
		if (slp->ns_flag & SLP_LASTFRAG) {
			struct nfsrv_rec *rec;
			int mf = (waitflag & MB_DONTWAIT) ? M_NOWAIT : M_WAITOK;
			rec = kmalloc(sizeof(struct nfsrv_rec), M_NFSRVDESC, mf);
			if (!rec) {
				m_freem(slp->ns_frag);
			} else {
				nfs_realign(&slp->ns_frag, 10 * NFSX_UNSIGNED);
				rec->nr_address = NULL;
				rec->nr_packet = slp->ns_frag;
				STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
				++slp->ns_numrec;
				++*countp;
			}
			slp->ns_frag = NULL;
		}
	}
}
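
/*
 * Completed records queued on ns_rec by the code above are consumed one
 * at a time by nfsrv_dorec() below, running in the nfsd service threads.
 */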

/*
 * Parse an RPC header.
 */
int
nfsrv_dorec(struct nfssvc_sock *slp, struct nfsd *nfsd,
	    struct nfsrv_descript **ndp)
{
	struct nfsrv_rec *rec;
	struct mbuf *m;
	struct sockaddr *nam;
	struct nfsrv_descript *nd;
	int error;

	*ndp = NULL;
	if ((slp->ns_flag & SLP_VALID) == 0 || !STAILQ_FIRST(&slp->ns_rec))
		return (ENOBUFS);
	rec = STAILQ_FIRST(&slp->ns_rec);
	STAILQ_REMOVE_HEAD(&slp->ns_rec, nr_link);
	KKASSERT(slp->ns_numrec > 0);
	--slp->ns_numrec;
	nam = rec->nr_address;
	m = rec->nr_packet;
	kfree(rec, M_NFSRVDESC);
	MALLOC(nd, struct nfsrv_descript *, sizeof (struct nfsrv_descript),
	       M_NFSRVDESC, M_WAITOK);
	nd->nd_md = nd->nd_mrep = m;
	nd->nd_nam2 = nam;
	nd->nd_dpos = mtod(m, caddr_t);
	error = nfs_getreq(nd, nfsd, TRUE);
	if (error) {
		if (nam) {
			FREE(nam, M_SONAME);
		}
		kfree((caddr_t)nd, M_NFSRVDESC);
		return (error);
	}
	*ndp = nd;
	nfsd->nfsd_nd = nd;
	return (0);
}
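
/*
 * Caller sketch (hypothetical, condensed): an nfsd service thread loops
 * doing roughly
 *
 *	error = nfsrv_dorec(slp, nfsd, &nd);
 *
 * and, when that returns 0, dispatches the parsed request in nd to the
 * matching nfsrv_*() procedure.  ENOBUFS just means no completed record
 * was queued and is not a hard error.
 */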

/*
 * Try to assign service sockets to nfsd threads based on the number
 * of new rpc requests that have been queued on the service socket.
 *
 * If no nfsd's are available or additional requests are pending, set the
 * NFSD_CHECKSLP flag so that one of the running nfsds will go look for
 * the work in the nfssvc_sock list when it is finished processing its
 * current work.  This flag is only cleared when an nfsd can not find
 * any new work to perform.
 */
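/*
 * For example, with nparallel == 3 and only two nfsds waiting, both are
 * woken and the leftover count leaves SLP_DOREC and NFSD_CHECKSLP set,
 * so the third queued request is picked up by whichever nfsd finishes
 * its current work first.
 */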
void
nfsrv_wakenfsd(struct nfssvc_sock *slp, int nparallel)
{
	struct nfsd *nd;

	if ((slp->ns_flag & SLP_VALID) == 0)
		return;
	if (nparallel <= 1)
		nparallel = 1;
	TAILQ_FOREACH(nd, &nfsd_head, nfsd_chain) {
		if (nd->nfsd_flag & NFSD_WAITING) {
			nd->nfsd_flag &= ~NFSD_WAITING;
			if (nd->nfsd_slp)
				panic("nfsd wakeup");
			slp->ns_sref++;
			nd->nfsd_slp = slp;
			wakeup((caddr_t)nd);
			if (--nparallel == 0)
				break;
		}
	}
	if (nparallel) {
		slp->ns_flag |= SLP_DOREC;
		nfsd_head_flag |= NFSD_CHECKSLP;
	}
}

#endif /* NFS_NOSERVER */