1 /*
2 * Copyright (c) 1989, 1991, 1993, 1995
3 * The Regents of the University of California. All rights reserved.
5 * This code is derived from software contributed to Berkeley by
6 * Rick Macklem at The University of Guelph.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 * must display the following acknowledgement:
18 * This product includes software developed by the University of
19 * California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
36 * @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
37 * $FreeBSD: src/sys/nfs/nfs_socket.c,v 1.60.2.6 2003/03/26 01:44:46 alfred Exp $
38 * $DragonFly: src/sys/vfs/nfs/nfs_socket.c,v 1.42 2007/02/25 23:17:13 corecode Exp $
42 * Socket operations for use by nfs
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/proc.h>
48 #include <sys/malloc.h>
49 #include <sys/mount.h>
50 #include <sys/kernel.h>
51 #include <sys/mbuf.h>
52 #include <sys/vnode.h>
53 #include <sys/fcntl.h>
54 #include <sys/protosw.h>
55 #include <sys/resourcevar.h>
56 #include <sys/socket.h>
57 #include <sys/socketvar.h>
58 #include <sys/socketops.h>
59 #include <sys/syslog.h>
60 #include <sys/thread.h>
61 #include <sys/tprintf.h>
62 #include <sys/sysctl.h>
63 #include <sys/signalvar.h>
64 #include <sys/signal2.h>
66 #include <netinet/in.h>
67 #include <netinet/tcp.h>
68 #include <sys/thread2.h>
70 #include "rpcv2.h"
71 #include "nfsproto.h"
72 #include "nfs.h"
73 #include "xdr_subs.h"
74 #include "nfsm_subs.h"
75 #include "nfsmount.h"
76 #include "nfsnode.h"
77 #include "nfsrtt.h"
79 #define TRUE 1
80 #define FALSE 0
83  * Estimate rto for an nfs rpc sent via an unreliable datagram.
84 * Use the mean and mean deviation of rtt for the appropriate type of rpc
85 * for the frequent rpcs and a default for the others.
86 * The justification for doing "other" this way is that these rpcs
87 * happen so infrequently that timer est. would probably be stale.
88 * Also, since many of these rpcs are
89 * non-idempotent, a conservative timeout is desired.
90 * getattr, lookup - A+2D
91 * read, write - A+4D
92 * other - nm_timeo
94 #define NFS_RTO(n, t) \
95 ((t) == 0 ? (n)->nm_timeo : \
96 ((t) < 3 ? \
97 (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
98 ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
99 #define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
100 #define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
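/*
 * A worked reading of the macro (a sketch derived from the updates in
 * nfs_reply(), where nm_srtt is kept scaled by 8 and nm_sdrtt by 4):
 * with A = mean rtt and D = mean deviation in ticks, the (t < 3)
 * branch is roughly ((8A)/4 + 4D + 1)/2 ~ A + 2D, and the read/write
 * branch is (8A)/8 + 4D = A + 4D, matching the table above.
 */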
102 * External data, mostly RPC constants in XDR form
104 extern u_int32_t rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers,
105 rpc_auth_unix, rpc_msgaccepted, rpc_call, rpc_autherr,
106 rpc_auth_kerb;
107 extern u_int32_t nfs_prog;
108 extern struct nfsstats nfsstats;
109 extern int nfsv3_procid[NFS_NPROCS];
110 extern int nfs_ticks;
113 * Defines which timer to use for the procnum.
114 * 0 - default
115 * 1 - getattr
116 * 2 - lookup
117 * 3 - read
118 * 4 - write
120 static int proct[NFS_NPROCS] = {
121 0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0,
122 0, 0, 0,
125 static int nfs_realign_test;
126 static int nfs_realign_count;
127 static int nfs_bufpackets = 4;
128 static int nfs_timer_raced;
130 SYSCTL_DECL(_vfs_nfs);
132 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RW, &nfs_realign_test, 0, "");
133 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RW, &nfs_realign_count, 0, "");
134 SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0, "");
138 * There is a congestion window for outstanding rpcs maintained per mount
139 * point. The cwnd size is adjusted in roughly the way that:
140 * Van Jacobson, Congestion avoidance and Control, In "Proceedings of
141 * SIGCOMM '88". ACM, August 1988.
142 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
143 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
144 * of rpcs is in progress.
145 * (The sent count and cwnd are scaled for integer arith.)
146 * Variants of "slow start" were tried and were found to be too much of a
147  * performance hit (ave. rtt 3 times larger);
148 * I suspect due to the large rtt that nfs rpcs have.
150 #define NFS_CWNDSCALE 256
151 #define NFS_MAXCWND (NFS_CWNDSCALE * 32)
152 static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, };
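/*
 * Net effect, roughly: while a full window of rpcs is outstanding,
 * each reply grows nm_cwnd by about NFS_CWNDSCALE*NFS_CWNDSCALE /
 * nm_cwnd, i.e. one rpc per round trip (see nfs_reply()), and a
 * retransmit timeout halves it, with NFS_CWNDSCALE as the floor
 * (see nfs_timer()).  nfs_backoff[] additionally scales the
 * retransmit timeout by 2..256 after successive timeouts on the
 * mount point.
 */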
153 int nfsrtton = 0;
154 struct nfsrtt nfsrtt;
155 struct callout nfs_timer_handle;
157 static int nfs_msg (struct thread *,char *,char *);
158 static int nfs_rcvlock (struct nfsreq *);
159 static void nfs_rcvunlock (struct nfsreq *);
160 static void nfs_realign (struct mbuf **pm, int hsiz);
161 static int nfs_receive (struct nfsreq *rep, struct sockaddr **aname,
162 struct mbuf **mp);
163 static void nfs_softterm (struct nfsreq *rep);
164 static int nfs_reconnect (struct nfsreq *rep);
165 #ifndef NFS_NOSERVER
166 static int nfsrv_getstream (struct nfssvc_sock *, int, int *);
168 int (*nfsrv3_procs[NFS_NPROCS]) (struct nfsrv_descript *nd,
169 struct nfssvc_sock *slp,
170 struct thread *td,
171 struct mbuf **mreqp) = {
172 nfsrv_null,
173 nfsrv_getattr,
174 nfsrv_setattr,
175 nfsrv_lookup,
176 nfsrv3_access,
177 nfsrv_readlink,
178 nfsrv_read,
179 nfsrv_write,
180 nfsrv_create,
181 nfsrv_mkdir,
182 nfsrv_symlink,
183 nfsrv_mknod,
184 nfsrv_remove,
185 nfsrv_rmdir,
186 nfsrv_rename,
187 nfsrv_link,
188 nfsrv_readdir,
189 nfsrv_readdirplus,
190 nfsrv_statfs,
191 nfsrv_fsinfo,
192 nfsrv_pathconf,
193 nfsrv_commit,
194 nfsrv_noop,
195 nfsrv_noop,
196 nfsrv_noop,
197 nfsrv_noop
199 #endif /* NFS_NOSERVER */
202 * Initialize sockets and congestion for a new NFS connection.
203 * We do not free the sockaddr if error.
206 nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
208 struct socket *so;
209 int error, rcvreserve, sndreserve;
210 int pktscale;
211 struct sockaddr *saddr;
212 struct sockaddr_in *sin;
213 struct thread *td = &thread0; /* only used for socreate and sobind */
215 nmp->nm_so = (struct socket *)0;
216 saddr = nmp->nm_nam;
217 error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
218 nmp->nm_soproto, td);
219 if (error)
220 goto bad;
221 so = nmp->nm_so;
222 nmp->nm_soflags = so->so_proto->pr_flags;
225 * Some servers require that the client port be a reserved port number.
227 if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) {
228 struct sockopt sopt;
229 int ip;
230 struct sockaddr_in ssin;
232 bzero(&sopt, sizeof sopt);
233 ip = IP_PORTRANGE_LOW;
234 sopt.sopt_level = IPPROTO_IP;
235 sopt.sopt_name = IP_PORTRANGE;
236 sopt.sopt_val = (void *)&ip;
237 sopt.sopt_valsize = sizeof(ip);
238 sopt.sopt_td = NULL;
239 error = sosetopt(so, &sopt);
240 if (error)
241 goto bad;
242 bzero(&ssin, sizeof ssin);
243 sin = &ssin;
244 sin->sin_len = sizeof (struct sockaddr_in);
245 sin->sin_family = AF_INET;
246 sin->sin_addr.s_addr = INADDR_ANY;
247 sin->sin_port = htons(0);
248 error = sobind(so, (struct sockaddr *)sin, td);
249 if (error)
250 goto bad;
251 bzero(&sopt, sizeof sopt);
252 ip = IP_PORTRANGE_DEFAULT;
253 sopt.sopt_level = IPPROTO_IP;
254 sopt.sopt_name = IP_PORTRANGE;
255 sopt.sopt_val = (void *)&ip;
256 sopt.sopt_valsize = sizeof(ip);
257 sopt.sopt_td = NULL;
258 error = sosetopt(so, &sopt);
259 if (error)
260 goto bad;
264 * Protocols that do not require connections may be optionally left
265 * unconnected for servers that reply from a port other than NFS_PORT.
267 if (nmp->nm_flag & NFSMNT_NOCONN) {
268 if (nmp->nm_soflags & PR_CONNREQUIRED) {
269 error = ENOTCONN;
270 goto bad;
272 } else {
273 error = soconnect(so, nmp->nm_nam, td);
274 if (error)
275 goto bad;
278 * Wait for the connection to complete. Cribbed from the
279 * connect system call but with the wait timing out so
280 * that interruptible mounts don't hang here for a long time.
282 crit_enter();
283 while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
284 (void) tsleep((caddr_t)&so->so_timeo, 0,
285 "nfscon", 2 * hz);
286 if ((so->so_state & SS_ISCONNECTING) &&
287 so->so_error == 0 && rep &&
288 (error = nfs_sigintr(nmp, rep, rep->r_td)) != 0){
289 so->so_state &= ~SS_ISCONNECTING;
290 crit_exit();
291 goto bad;
294 if (so->so_error) {
295 error = so->so_error;
296 so->so_error = 0;
297 crit_exit();
298 goto bad;
300 crit_exit();
302 so->so_rcv.sb_timeo = (5 * hz);
303 so->so_snd.sb_timeo = (5 * hz);
306 * Get buffer reservation size from sysctl, but impose reasonable
307 * limits.
309 pktscale = nfs_bufpackets;
310 if (pktscale < 2)
311 pktscale = 2;
312 if (pktscale > 64)
313 pktscale = 64;
315 if (nmp->nm_sotype == SOCK_DGRAM) {
316 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
317 rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
318 NFS_MAXPKTHDR) * pktscale;
319 } else if (nmp->nm_sotype == SOCK_SEQPACKET) {
320 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
321 rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
322 NFS_MAXPKTHDR) * pktscale;
323 } else {
324 if (nmp->nm_sotype != SOCK_STREAM)
325 panic("nfscon sotype");
326 if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
327 struct sockopt sopt;
328 int val;
330 bzero(&sopt, sizeof sopt);
331 sopt.sopt_level = SOL_SOCKET;
332 sopt.sopt_name = SO_KEEPALIVE;
333 sopt.sopt_val = &val;
334 sopt.sopt_valsize = sizeof val;
335 val = 1;
336 sosetopt(so, &sopt);
338 if (so->so_proto->pr_protocol == IPPROTO_TCP) {
339 struct sockopt sopt;
340 int val;
342 bzero(&sopt, sizeof sopt);
343 sopt.sopt_level = IPPROTO_TCP;
344 sopt.sopt_name = TCP_NODELAY;
345 sopt.sopt_val = &val;
346 sopt.sopt_valsize = sizeof val;
347 val = 1;
348 sosetopt(so, &sopt);
350 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
351 sizeof (u_int32_t)) * pktscale;
352 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
353 sizeof (u_int32_t)) * pktscale;
355 error = soreserve(so, sndreserve, rcvreserve,
356 &td->td_proc->p_rlimit[RLIMIT_SBSIZE]);
357 if (error)
358 goto bad;
359 so->so_rcv.sb_flags |= SB_NOINTR;
360 so->so_snd.sb_flags |= SB_NOINTR;
362 /* Initialize other non-zero congestion variables */
363 nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
364 nmp->nm_srtt[3] = (NFS_TIMEO << 3);
365 nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
366 nmp->nm_sdrtt[3] = 0;
367 nmp->nm_cwnd = NFS_MAXCWND / 2; /* Initial send window */
368 nmp->nm_sent = 0;
369 nmp->nm_timeouts = 0;
370 return (0);
372 bad:
373 nfs_disconnect(nmp);
374 return (error);
378 * Reconnect routine:
379 * Called when a connection is broken on a reliable protocol.
380 * - clean up the old socket
381 * - nfs_connect() again
382 * - set R_MUSTRESEND for all outstanding requests on mount point
383 * If this fails the mount point is DEAD!
384 * nb: Must be called with the nfs_sndlock() set on the mount point.
386 static int
387 nfs_reconnect(struct nfsreq *rep)
389 struct nfsreq *rp;
390 struct nfsmount *nmp = rep->r_nmp;
391 int error;
393 nfs_disconnect(nmp);
394 while ((error = nfs_connect(nmp, rep)) != 0) {
395 if (error == EINTR || error == ERESTART)
396 return (EINTR);
397 (void) tsleep((caddr_t)&lbolt, 0, "nfscon", 0);
401 * Loop through outstanding request list and fix up all requests
402 * on old socket.
404 crit_enter();
405 TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
406 if (rp->r_nmp == nmp)
407 rp->r_flags |= R_MUSTRESEND;
409 crit_exit();
410 return (0);
414 * NFS disconnect. Clean up and unlink.
416 void
417 nfs_disconnect(struct nfsmount *nmp)
419 struct socket *so;
421 if (nmp->nm_so) {
422 so = nmp->nm_so;
423 nmp->nm_so = (struct socket *)0;
424 soshutdown(so, 2);
425 soclose(so, FNONBLOCK);
429 void
430 nfs_safedisconnect(struct nfsmount *nmp)
432 struct nfsreq dummyreq;
434 bzero(&dummyreq, sizeof(dummyreq));
435 dummyreq.r_nmp = nmp;
436 dummyreq.r_td = NULL;
437 nfs_rcvlock(&dummyreq);
438 nfs_disconnect(nmp);
439 nfs_rcvunlock(&dummyreq);
443 * This is the nfs send routine. For connection based socket types, it
444 * must be called with an nfs_sndlock() on the socket.
445 * "rep == NULL" indicates that it has been called from a server.
446 * For the client side:
447 * - return EINTR if the RPC is terminated, 0 otherwise
448 * - set R_MUSTRESEND if the send fails for any reason
449 * - do any cleanup required by recoverable socket errors (?)
450 * For the server side:
451 * - return EINTR or ERESTART if interrupted by a signal
452 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
453 * - do any cleanup required by recoverable socket errors (?)
456 nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top,
457 struct nfsreq *rep)
459 struct sockaddr *sendnam;
460 int error, soflags, flags;
462 if (rep) {
463 if (rep->r_flags & R_SOFTTERM) {
464 m_freem(top);
465 return (EINTR);
467 if ((so = rep->r_nmp->nm_so) == NULL) {
468 rep->r_flags |= R_MUSTRESEND;
469 m_freem(top);
470 return (0);
472 rep->r_flags &= ~R_MUSTRESEND;
473 soflags = rep->r_nmp->nm_soflags;
474 } else
475 soflags = so->so_proto->pr_flags;
476 if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
477 sendnam = (struct sockaddr *)0;
478 else
479 sendnam = nam;
480 if (so->so_type == SOCK_SEQPACKET)
481 flags = MSG_EOR;
482 else
483 flags = 0;
485 error = so_pru_sosend(so, sendnam, NULL, top, NULL, flags,
486 curthread /*XXX*/);
488 * ENOBUFS for dgram sockets is transient and non fatal.
489 * No need to log, and no need to break a soft mount.
491 if (error == ENOBUFS && so->so_type == SOCK_DGRAM) {
492 error = 0;
493 if (rep) /* do backoff retransmit on client */
494 rep->r_flags |= R_MUSTRESEND;
497 if (error) {
498 if (rep) {
499 log(LOG_INFO, "nfs send error %d for server %s\n",error,
500 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
502 * Deal with errors for the client side.
504 if (rep->r_flags & R_SOFTTERM)
505 error = EINTR;
506 else
507 rep->r_flags |= R_MUSTRESEND;
508 } else
509 log(LOG_INFO, "nfsd send error %d\n", error);
512 * Handle any recoverable (soft) socket errors here. (?)
514 if (error != EINTR && error != ERESTART &&
515 error != EWOULDBLOCK && error != EPIPE)
516 error = 0;
518 return (error);
522 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
523 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
524 * Mark and consolidate the data into a new mbuf list.
525 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
526 * small mbufs.
527 * For SOCK_STREAM we must be very careful to read an entire record once
528 * we have read any of it, even if the system call has been interrupted.
530 static int
531 nfs_receive(struct nfsreq *rep, struct sockaddr **aname, struct mbuf **mp)
533 struct socket *so;
534 struct uio auio;
535 struct iovec aio;
536 struct mbuf *m;
537 struct mbuf *control;
538 u_int32_t len;
539 struct sockaddr **getnam;
540 int error, sotype, rcvflg;
541 struct thread *td = curthread; /* XXX */
544 * Set up arguments for soreceive()
546 *mp = (struct mbuf *)0;
547 *aname = (struct sockaddr *)0;
548 sotype = rep->r_nmp->nm_sotype;
551 * For reliable protocols, lock against other senders/receivers
552 * in case a reconnect is necessary.
553 * For SOCK_STREAM, first get the Record Mark to find out how much
554 * more there is to get.
555 * We must lock the socket against other receivers
556 * until we have an entire rpc request/reply.
558 if (sotype != SOCK_DGRAM) {
559 error = nfs_sndlock(rep);
560 if (error)
561 return (error);
562 tryagain:
564 * Check for fatal errors and resending request.
567 * Ugh: If a reconnect attempt just happened, nm_so
568 * would have changed. NULL indicates a failed
569 * attempt that has essentially shut down this
570 * mount point.
572 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) {
573 nfs_sndunlock(rep);
574 return (EINTR);
576 so = rep->r_nmp->nm_so;
577 if (!so) {
578 error = nfs_reconnect(rep);
579 if (error) {
580 nfs_sndunlock(rep);
581 return (error);
583 goto tryagain;
585 while (rep->r_flags & R_MUSTRESEND) {
586 m = m_copym(rep->r_mreq, 0, M_COPYALL, MB_WAIT);
587 nfsstats.rpcretries++;
588 error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
589 if (error) {
590 if (error == EINTR || error == ERESTART ||
591 (error = nfs_reconnect(rep)) != 0) {
592 nfs_sndunlock(rep);
593 return (error);
595 goto tryagain;
598 nfs_sndunlock(rep);
599 if (sotype == SOCK_STREAM) {
600 aio.iov_base = (caddr_t) &len;
601 aio.iov_len = sizeof(u_int32_t);
602 auio.uio_iov = &aio;
603 auio.uio_iovcnt = 1;
604 auio.uio_segflg = UIO_SYSSPACE;
605 auio.uio_rw = UIO_READ;
606 auio.uio_offset = 0;
607 auio.uio_resid = sizeof(u_int32_t);
608 auio.uio_td = td;
609 do {
610 rcvflg = MSG_WAITALL;
611 error = so_pru_soreceive(so, NULL, &auio, NULL,
612 NULL, &rcvflg);
613 if (error == EWOULDBLOCK && rep) {
614 if (rep->r_flags & R_SOFTTERM)
615 return (EINTR);
617 } while (error == EWOULDBLOCK);
618 if (!error && auio.uio_resid > 0) {
620 * Don't log a 0 byte receive; it means
621 * that the socket has been closed, and
622 * can happen during normal operation
623 * (forcible unmount or Solaris server).
625 if (auio.uio_resid != sizeof (u_int32_t))
626 log(LOG_INFO,
627 "short receive (%d/%d) from nfs server %s\n",
628 (int)(sizeof(u_int32_t) - auio.uio_resid),
629 (int)sizeof(u_int32_t),
630 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
631 error = EPIPE;
633 if (error)
634 goto errout;
635 len = ntohl(len) & ~0x80000000;
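			/*
			 * RFC 1831 record mark: the high bit flags the
			 * last fragment of a record and the low 31 bits
			 * carry the fragment length.  This code assumes
			 * each rpc arrives as a single fragment.
			 */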
637 * This is SERIOUS! We are out of sync with the sender
638 * and forcing a disconnect/reconnect is all I can do.
640 if (len > NFS_MAXPACKET) {
641 log(LOG_ERR, "%s (%d) from nfs server %s\n",
642 "impossible packet length",
643 len,
644 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
645 error = EFBIG;
646 goto errout;
648 auio.uio_resid = len;
649 do {
650 rcvflg = MSG_WAITALL;
651 error = so_pru_soreceive(so, NULL, &auio, mp,
652 NULL, &rcvflg);
653 } while (error == EWOULDBLOCK || error == EINTR ||
654 error == ERESTART);
655 if (!error && auio.uio_resid > 0) {
656 if (len != auio.uio_resid)
657 log(LOG_INFO,
658 "short receive (%d/%d) from nfs server %s\n",
659 len - auio.uio_resid, len,
660 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
661 error = EPIPE;
663 } else {
665 * NB: Since uio_resid is big, MSG_WAITALL is ignored
666 * and soreceive() will return when it has either a
667 * control msg or a data msg.
668  * We have no use for control msgs, but must grab them
669 * and then throw them away so we know what is going
670 * on.
672 auio.uio_resid = len = 100000000; /* Anything Big */
673 auio.uio_td = td;
674 do {
675 rcvflg = 0;
676 error = so_pru_soreceive(so, NULL, &auio, mp,
677 &control, &rcvflg);
678 if (control)
679 m_freem(control);
680 if (error == EWOULDBLOCK && rep) {
681 if (rep->r_flags & R_SOFTTERM)
682 return (EINTR);
684 } while (error == EWOULDBLOCK ||
685 (!error && *mp == NULL && control));
686 if ((rcvflg & MSG_EOR) == 0)
687 kprintf("Egad!!\n");
688 if (!error && *mp == NULL)
689 error = EPIPE;
690 len -= auio.uio_resid;
692 errout:
693 if (error && error != EINTR && error != ERESTART) {
694 m_freem(*mp);
695 *mp = (struct mbuf *)0;
696 if (error != EPIPE)
697 log(LOG_INFO,
698 "receive error %d from nfs server %s\n",
699 error,
700 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
701 error = nfs_sndlock(rep);
702 if (!error) {
703 error = nfs_reconnect(rep);
704 if (!error)
705 goto tryagain;
706 else
707 nfs_sndunlock(rep);
710 } else {
711 if ((so = rep->r_nmp->nm_so) == NULL)
712 return (EACCES);
713 if (so->so_state & SS_ISCONNECTED)
714 getnam = (struct sockaddr **)0;
715 else
716 getnam = aname;
717 auio.uio_resid = len = 1000000;
718 auio.uio_td = td;
719 do {
720 rcvflg = 0;
721 error = so_pru_soreceive(so, getnam, &auio, mp, NULL,
722 &rcvflg);
723 if (error == EWOULDBLOCK &&
724 (rep->r_flags & R_SOFTTERM))
725 return (EINTR);
726 } while (error == EWOULDBLOCK);
727 len -= auio.uio_resid;
729 if (error) {
730 m_freem(*mp);
731 *mp = (struct mbuf *)0;
734 * Search for any mbufs that are not a multiple of 4 bytes long
735 * or with m_data not longword aligned.
736 * These could cause pointer alignment problems, so copy them to
737 * well aligned mbufs.
739 nfs_realign(mp, 5 * NFSX_UNSIGNED);
740 return (error);
744 * Implement receipt of reply on a socket.
745 * We must search through the list of received datagrams matching them
746 * with outstanding requests using the xid, until ours is found.
748 /* ARGSUSED */
750 nfs_reply(struct nfsreq *myrep)
752 struct nfsreq *rep;
753 struct nfsmount *nmp = myrep->r_nmp;
754 int32_t t1;
755 struct mbuf *mrep, *md;
756 struct sockaddr *nam;
757 u_int32_t rxid, *tl;
758 caddr_t dpos, cp2;
759 int error;
762 * Loop around until we get our own reply
764 for (;;) {
766 * Lock against other receivers so that I don't get stuck in
767 * sbwait() after someone else has received my reply for me.
768 * Also necessary for connection based protocols to avoid
769 * race conditions during a reconnect.
770 * If nfs_rcvlock() returns EALREADY, that means that
771  * the reply has already been received by another
772 * process and we can return immediately. In this
773 * case, the lock is not taken to avoid races with
774 * other processes.
776 error = nfs_rcvlock(myrep);
777 if (error == EALREADY)
778 return (0);
779 if (error)
780 return (error);
782 * Get the next Rpc reply off the socket
784 error = nfs_receive(myrep, &nam, &mrep);
785 nfs_rcvunlock(myrep);
786 if (error) {
788 * Ignore routing errors on connectionless protocols??
790 if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) {
791 nmp->nm_so->so_error = 0;
792 if (myrep->r_flags & R_GETONEREP)
793 return (0);
794 continue;
796 return (error);
798 if (nam)
799 FREE(nam, M_SONAME);
802 * Get the xid and check that it is an rpc reply
804 md = mrep;
805 dpos = mtod(md, caddr_t);
806 nfsm_dissect(tl, u_int32_t *, 2*NFSX_UNSIGNED);
807 rxid = *tl++;
808 if (*tl != rpc_reply) {
809 nfsstats.rpcinvalid++;
810 m_freem(mrep);
811 nfsmout:
812 if (myrep->r_flags & R_GETONEREP)
813 return (0);
814 continue;
818 * Loop through the request list to match up the reply
819 * Iff no match, just drop the datagram. On match, set
820 * r_mrep atomically to prevent the timer from messing
821 * around with the request after we have exited the critical
822 * section.
824 crit_enter();
825 TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
826 if (rep->r_mrep == NULL && rxid == rep->r_xid) {
827 rep->r_mrep = mrep;
828 break;
831 crit_exit();
834 * Fill in the rest of the reply if we found a match.
836 if (rep) {
837 rep->r_md = md;
838 rep->r_dpos = dpos;
839 if (nfsrtton) {
840 struct rttl *rt;
842 rt = &nfsrtt.rttl[nfsrtt.pos];
843 rt->proc = rep->r_procnum;
844 rt->rto = NFS_RTO(nmp, proct[rep->r_procnum]);
845 rt->sent = nmp->nm_sent;
846 rt->cwnd = nmp->nm_cwnd;
847 rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1];
848 rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1];
849 rt->fsid = nmp->nm_mountp->mnt_stat.f_fsid;
850 getmicrotime(&rt->tstamp);
851 if (rep->r_flags & R_TIMING)
852 rt->rtt = rep->r_rtt;
853 else
854 rt->rtt = 1000000;
855 nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ;
858 * Update congestion window.
859 * Do the additive increase of
860 * one rpc/rtt.
862 if (nmp->nm_cwnd <= nmp->nm_sent) {
863 nmp->nm_cwnd +=
864 (NFS_CWNDSCALE * NFS_CWNDSCALE +
865 (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
866 if (nmp->nm_cwnd > NFS_MAXCWND)
867 nmp->nm_cwnd = NFS_MAXCWND;
869 crit_enter(); /* nfs_timer interlock for nm_sent */
870 if (rep->r_flags & R_SENT) {
871 rep->r_flags &= ~R_SENT;
872 nmp->nm_sent -= NFS_CWNDSCALE;
874 crit_exit();
876 * Update rtt using a gain of 0.125 on the mean
877 * and a gain of 0.25 on the deviation.
879 if (rep->r_flags & R_TIMING) {
881 * Since the timer resolution of
882  * NFS_HZ is so coarse, it can often
883 * result in r_rtt == 0. Since
884 * r_rtt == N means that the actual
885 * rtt is between N+dt and N+2-dt ticks,
886 * add 1.
888 t1 = rep->r_rtt + 1;
889 t1 -= (NFS_SRTT(rep) >> 3);
890 NFS_SRTT(rep) += t1;
891 if (t1 < 0)
892 t1 = -t1;
893 t1 -= (NFS_SDRTT(rep) >> 2);
894 NFS_SDRTT(rep) += t1;
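			/*
			 * The updates above are the fixed-point form of
			 * the estimator: with SRTT scaled by 8 and SDRTT
			 * by 4, "SRTT += sample - SRTT/8" computes
			 * SRTT = 7/8 * SRTT + 1/8 * sample, and the
			 * deviation update likewise has gain 1/4.
			 */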
896 nmp->nm_timeouts = 0;
899 * If not matched to a request, drop it.
900 * If it's mine, get out.
902 if (rep == NULL) {
903 nfsstats.rpcunexpected++;
904 m_freem(mrep);
905 } else if (rep == myrep) {
906 if (rep->r_mrep == NULL)
907 panic("nfsreply nil");
908 return (0);
910 if (myrep->r_flags & R_GETONEREP)
911 return (0);
916 * nfs_request - goes something like this
917 * - fill in request struct
918 * - links it into list
919 * - calls nfs_send() for first transmit
920 * - calls nfs_receive() to get reply
921 * - break down rpc header and return with nfs reply pointed to
922 * by mrep or error
923 * nb: always frees up mreq mbuf list
926 nfs_request(struct vnode *vp, struct mbuf *mrest, int procnum,
927 struct thread *td, struct ucred *cred, struct mbuf **mrp,
928 struct mbuf **mdp, caddr_t *dposp)
930 struct mbuf *mrep, *m2;
931 struct nfsreq *rep;
932 u_int32_t *tl;
933 int i;
934 struct nfsmount *nmp;
935 struct mbuf *m, *md, *mheadend;
936 char nickv[RPCX_NICKVERF];
937 time_t waituntil;
938 caddr_t dpos, cp2;
939 int t1, error = 0, mrest_len, auth_len, auth_type;
940 int trylater_delay = 15, trylater_cnt = 0, failed_auth = 0;
941 int verf_len, verf_type;
942 u_int32_t xid;
943 char *auth_str, *verf_str;
944 NFSKERBKEY_T key; /* save session key */
946 /* Reject requests while attempting a forced unmount. */
947 if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) {
948 m_freem(mrest);
949 return (ESTALE);
951 nmp = VFSTONFS(vp->v_mount);
952 MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
953 rep->r_nmp = nmp;
954 rep->r_vp = vp;
955 rep->r_td = td;
956 rep->r_procnum = procnum;
957 rep->r_mreq = NULL;
958 i = 0;
959 m = mrest;
960 while (m) {
961 i += m->m_len;
962 m = m->m_next;
964 mrest_len = i;
967 * Get the RPC header with authorization.
969 kerbauth:
970 verf_str = auth_str = (char *)0;
971 if (nmp->nm_flag & NFSMNT_KERB) {
972 verf_str = nickv;
973 verf_len = sizeof (nickv);
974 auth_type = RPCAUTH_KERB4;
975 bzero((caddr_t)key, sizeof (key));
976 if (failed_auth || nfs_getnickauth(nmp, cred, &auth_str,
977 &auth_len, verf_str, verf_len)) {
978 error = nfs_getauth(nmp, rep, cred, &auth_str,
979 &auth_len, verf_str, &verf_len, key);
980 if (error) {
981 kfree((caddr_t)rep, M_NFSREQ);
982 m_freem(mrest);
983 return (error);
986 } else {
987 auth_type = RPCAUTH_UNIX;
988 if (cred->cr_ngroups < 1)
989 panic("nfsreq nogrps");
990 auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
991 nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
992 5 * NFSX_UNSIGNED;
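		/*
		 * A sketch of the AUTH_UNIX sizing: the 5 words cover
		 * the stamp, a zero-length machine name, uid, gid and
		 * the gid count, and each supplementary gid (capped at
		 * nm_numgrps) adds one word.  nfsm_rpchead() below
		 * builds the actual credential.
		 */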
994 m = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
995 auth_str, verf_len, verf_str, mrest, mrest_len, &mheadend, &xid);
996 if (auth_str)
997 kfree(auth_str, M_TEMP);
1000 * For stream protocols, insert a Sun RPC Record Mark.
1002 if (nmp->nm_sotype == SOCK_STREAM) {
1003 M_PREPEND(m, NFSX_UNSIGNED, MB_WAIT);
1004 if (m == NULL) {
1005 kfree(rep, M_NFSREQ);
1006 return (ENOBUFS);
1008 *mtod(m, u_int32_t *) = htonl(0x80000000 |
1009 (m->m_pkthdr.len - NFSX_UNSIGNED));
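		/*
		 * As on the receive side, the high bit of the record
		 * mark written above flags the final (here, only)
		 * fragment of the rpc record; the low 31 bits carry
		 * the fragment length.
		 */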
1011 rep->r_mreq = m;
1012 rep->r_xid = xid;
1013 tryagain:
1014 if (nmp->nm_flag & NFSMNT_SOFT)
1015 rep->r_retry = nmp->nm_retry;
1016 else
1017 rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */
1018 rep->r_rtt = rep->r_rexmit = 0;
1019 if (proct[procnum] > 0)
1020 rep->r_flags = R_TIMING | R_MASKTIMER;
1021 else
1022 rep->r_flags = R_MASKTIMER;
1023 rep->r_mrep = NULL;
1026 * Do the client side RPC.
1028 nfsstats.rpcrequests++;
1031 * Chain request into list of outstanding requests. Be sure
1032 * to put it LAST so timer finds oldest requests first. Note
1033 * that R_MASKTIMER is set at the moment to prevent any timer
1034 * action on this request while we are still doing processing on
1035 * it below. splsoftclock() primarily protects nm_sent. Note
1036  * that we may block in this code so there is no atomicity guarantee.
1038 crit_enter();
1039 TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);
1042 * If backing off another request or avoiding congestion, don't
1043 * send this one now but let timer do it. If not timing a request,
1044 * do it now.
1046 * Even though the timer will not mess with our request there is
1047 * still the possibility that we will race a reply (which clears
1048 * R_SENT), especially on localhost connections, so be very careful
1049 * when setting R_SENT. We could set R_SENT prior to calling
1050 * nfs_send() but why bother if the response occurs that quickly?
1052 if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
1053 (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1054 nmp->nm_sent < nmp->nm_cwnd)) {
1055 if (nmp->nm_soflags & PR_CONNREQUIRED)
1056 error = nfs_sndlock(rep);
1057 if (!error) {
1058 m2 = m_copym(m, 0, M_COPYALL, MB_WAIT);
1059 error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
1060 if (nmp->nm_soflags & PR_CONNREQUIRED)
1061 nfs_sndunlock(rep);
1063 if (!error && (rep->r_flags & R_MUSTRESEND) == 0 &&
1064 rep->r_mrep == NULL) {
1065 KASSERT((rep->r_flags & R_SENT) == 0,
1066 ("R_SENT ASSERT %p", rep));
1067 nmp->nm_sent += NFS_CWNDSCALE;
1068 rep->r_flags |= R_SENT;
1070 } else {
1071 rep->r_rtt = -1;
1075 * Let the timer do what it will with the request, then
1076 * wait for the reply from our send or the timer's.
1078 if (!error || error == EPIPE) {
1079 rep->r_flags &= ~R_MASKTIMER;
1080 crit_exit();
1081 error = nfs_reply(rep);
1082 crit_enter();
1086 * RPC done, unlink the request, but don't rip it out from under
1087 * the callout timer.
1089 while (rep->r_flags & R_LOCKED) {
1090 nfs_timer_raced = 1;
1091 tsleep(&nfs_timer_raced, 0, "nfstrac", 0);
1093 TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
1096 * Decrement the outstanding request count.
1098 if (rep->r_flags & R_SENT) {
1099 rep->r_flags &= ~R_SENT;
1100 nmp->nm_sent -= NFS_CWNDSCALE;
1102 crit_exit();
1105  * If there was a successful reply and a tprintf msg was printed,
1106  * tprintf that the server is alive again.
1108 if (!error && (rep->r_flags & R_TPRINTFMSG))
1109 nfs_msg(rep->r_td, nmp->nm_mountp->mnt_stat.f_mntfromname,
1110 "is alive again");
1111 mrep = rep->r_mrep;
1112 md = rep->r_md;
1113 dpos = rep->r_dpos;
1114 if (error) {
1115 m_freem(rep->r_mreq);
1116 kfree((caddr_t)rep, M_NFSREQ);
1117 return (error);
1121 * break down the rpc header and check if ok
1123 nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
1124 if (*tl++ == rpc_msgdenied) {
1125 if (*tl == rpc_mismatch)
1126 error = EOPNOTSUPP;
1127 else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) {
1128 if (!failed_auth) {
1129 failed_auth++;
1130 mheadend->m_next = (struct mbuf *)0;
1131 m_freem(mrep);
1132 m_freem(rep->r_mreq);
1133 goto kerbauth;
1134 } else
1135 error = EAUTH;
1136 } else
1137 error = EACCES;
1138 m_freem(mrep);
1139 m_freem(rep->r_mreq);
1140 kfree((caddr_t)rep, M_NFSREQ);
1141 return (error);
1145 * Grab any Kerberos verifier, otherwise just throw it away.
1147 verf_type = fxdr_unsigned(int, *tl++);
1148 i = fxdr_unsigned(int32_t, *tl);
1149 if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) {
1150 error = nfs_savenickauth(nmp, cred, i, key, &md, &dpos, mrep);
1151 if (error)
1152 goto nfsmout;
1153 } else if (i > 0)
1154 nfsm_adv(nfsm_rndup(i));
1155 nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
1156 /* 0 == ok */
1157 if (*tl == 0) {
1158 nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
1159 if (*tl != 0) {
1160 error = fxdr_unsigned(int, *tl);
1161 if ((nmp->nm_flag & NFSMNT_NFSV3) &&
1162 error == NFSERR_TRYLATER) {
1163 m_freem(mrep);
1164 error = 0;
1165 waituntil = time_second + trylater_delay;
1166 while (time_second < waituntil)
1167 (void) tsleep((caddr_t)&lbolt,
1168 0, "nqnfstry", 0);
1169 trylater_delay *= nfs_backoff[trylater_cnt];
1170 if (trylater_cnt < 7)
1171 trylater_cnt++;
1172 goto tryagain;
1176 * If the File Handle was stale, invalidate the
1177 * lookup cache, just in case.
1179 * To avoid namecache<->vnode deadlocks we must
1180 * release the vnode lock if we hold it.
1182 if (error == ESTALE) {
1183 int ltype;
1185 ltype = lockstatus(&vp->v_lock, curthread);
1186 if (ltype == LK_EXCLUSIVE || ltype == LK_SHARED)
1187 lockmgr(&vp->v_lock, LK_RELEASE);
1188 cache_inval_vp(vp, CINV_CHILDREN);
1189 if (ltype == LK_EXCLUSIVE || ltype == LK_SHARED)
1190 lockmgr(&vp->v_lock, ltype);
1192 if (nmp->nm_flag & NFSMNT_NFSV3) {
1193 *mrp = mrep;
1194 *mdp = md;
1195 *dposp = dpos;
1196 error |= NFSERR_RETERR;
1197 } else
1198 m_freem(mrep);
1199 m_freem(rep->r_mreq);
1200 kfree((caddr_t)rep, M_NFSREQ);
1201 return (error);
1204 *mrp = mrep;
1205 *mdp = md;
1206 *dposp = dpos;
1207 m_freem(rep->r_mreq);
1208 FREE((caddr_t)rep, M_NFSREQ);
1209 return (0);
1211 m_freem(mrep);
1212 error = EPROTONOSUPPORT;
1213 nfsmout:
1214 m_freem(rep->r_mreq);
1215 kfree((caddr_t)rep, M_NFSREQ);
1216 return (error);
1219 #ifndef NFS_NOSERVER
1221 * Generate the rpc reply header
1222 * siz arg. is used to decide if adding a cluster is worthwhile
1225 nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp,
1226 int err, struct mbuf **mrq, struct mbuf **mbp, caddr_t *bposp)
1228 u_int32_t *tl;
1229 struct mbuf *mreq;
1230 caddr_t bpos;
1231 struct mbuf *mb, *mb2;
1233 siz += RPC_REPLYSIZ;
1234 mb = mreq = m_getl(max_hdr + siz, MB_WAIT, MT_DATA, M_PKTHDR, NULL);
1235 mreq->m_pkthdr.len = 0;
1237 * If this is not a cluster, try and leave leading space
1238 * for the lower level headers.
1240 if ((max_hdr + siz) < MINCLSIZE)
1241 mreq->m_data += max_hdr;
1242 tl = mtod(mreq, u_int32_t *);
1243 mreq->m_len = 6 * NFSX_UNSIGNED;
1244 bpos = ((caddr_t)tl) + mreq->m_len;
1245 *tl++ = txdr_unsigned(nd->nd_retxid);
1246 *tl++ = rpc_reply;
1247 if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
1248 *tl++ = rpc_msgdenied;
1249 if (err & NFSERR_AUTHERR) {
1250 *tl++ = rpc_autherr;
1251 *tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
1252 mreq->m_len -= NFSX_UNSIGNED;
1253 bpos -= NFSX_UNSIGNED;
1254 } else {
1255 *tl++ = rpc_mismatch;
1256 *tl++ = txdr_unsigned(RPC_VER2);
1257 *tl = txdr_unsigned(RPC_VER2);
1259 } else {
1260 *tl++ = rpc_msgaccepted;
1263 * For Kerberos authentication, we must send the nickname
1264 * verifier back, otherwise just RPCAUTH_NULL.
1266 if (nd->nd_flag & ND_KERBFULL) {
1267 struct nfsuid *nuidp;
1268 struct timeval ktvin, ktvout;
1270 for (nuidp = NUIDHASH(slp, nd->nd_cr.cr_uid)->lh_first;
1271 nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
1272 if (nuidp->nu_cr.cr_uid == nd->nd_cr.cr_uid &&
1273 (!nd->nd_nam2 || netaddr_match(NU_NETFAM(nuidp),
1274 &nuidp->nu_haddr, nd->nd_nam2)))
1275 break;
1277 if (nuidp) {
1278 ktvin.tv_sec =
1279 txdr_unsigned(nuidp->nu_timestamp.tv_sec - 1);
1280 ktvin.tv_usec =
1281 txdr_unsigned(nuidp->nu_timestamp.tv_usec);
1284 * Encrypt the timestamp in ecb mode using the
1285 * session key.
1287 #ifdef NFSKERB
1289 #endif
1291 *tl++ = rpc_auth_kerb;
1292 *tl++ = txdr_unsigned(3 * NFSX_UNSIGNED);
1293 *tl = ktvout.tv_sec;
1294 nfsm_build(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
1295 *tl++ = ktvout.tv_usec;
1296 *tl++ = txdr_unsigned(nuidp->nu_cr.cr_uid);
1297 } else {
1298 *tl++ = 0;
1299 *tl++ = 0;
1301 } else {
1302 *tl++ = 0;
1303 *tl++ = 0;
1305 switch (err) {
1306 case EPROGUNAVAIL:
1307 *tl = txdr_unsigned(RPC_PROGUNAVAIL);
1308 break;
1309 case EPROGMISMATCH:
1310 *tl = txdr_unsigned(RPC_PROGMISMATCH);
1311 nfsm_build(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
1312 *tl++ = txdr_unsigned(2);
1313 *tl = txdr_unsigned(3);
1314 break;
1315 case EPROCUNAVAIL:
1316 *tl = txdr_unsigned(RPC_PROCUNAVAIL);
1317 break;
1318 case EBADRPC:
1319 *tl = txdr_unsigned(RPC_GARBAGE);
1320 break;
1321 default:
1322 *tl = 0;
1323 if (err != NFSERR_RETVOID) {
1324 nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED);
1325 if (err)
1326 *tl = txdr_unsigned(nfsrv_errmap(nd, err));
1327 else
1328 *tl = 0;
1330 break;
1334 if (mrq != NULL)
1335 *mrq = mreq;
1336 *mbp = mb;
1337 *bposp = bpos;
1338 if (err != 0 && err != NFSERR_RETVOID)
1339 nfsstats.srvrpc_errs++;
1340 return (0);
1344 #endif /* NFS_NOSERVER */
1346 * Nfs timer routine
1347  * Scan the nfsreq list and retransmit any requests that have timed out.
1348 * To avoid retransmission attempts on STREAM sockets (in the future) make
1349 * sure to set the r_retry field to 0 (implies nm_retry == 0).
1351 void
1352 nfs_timer(void *arg /* never used */)
1354 struct nfsreq *rep;
1355 struct mbuf *m;
1356 struct socket *so;
1357 struct nfsmount *nmp;
1358 int timeo;
1359 int error;
1360 #ifndef NFS_NOSERVER
1361 struct nfssvc_sock *slp;
1362 u_quad_t cur_usec;
1363 #endif /* NFS_NOSERVER */
1364 struct thread *td = &thread0; /* XXX for credentials, will break if sleep */
1366 crit_enter();
1367 TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
1368 nmp = rep->r_nmp;
1369 if (rep->r_mrep || (rep->r_flags & (R_SOFTTERM|R_MASKTIMER)))
1370 continue;
1371 rep->r_flags |= R_LOCKED;
1372 if (nfs_sigintr(nmp, rep, rep->r_td)) {
1373 nfs_softterm(rep);
1374 goto skip;
1376 if (rep->r_rtt >= 0) {
1377 rep->r_rtt++;
1378 if (nmp->nm_flag & NFSMNT_DUMBTIMR)
1379 timeo = nmp->nm_timeo;
1380 else
1381 timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
1382 if (nmp->nm_timeouts > 0)
1383 timeo *= nfs_backoff[nmp->nm_timeouts - 1];
1384 if (rep->r_rtt <= timeo)
1385 goto skip;
1386 if (nmp->nm_timeouts < 8)
1387 nmp->nm_timeouts++;
1390 * Check for server not responding
1392 if ((rep->r_flags & R_TPRINTFMSG) == 0 &&
1393 rep->r_rexmit > nmp->nm_deadthresh) {
1394 nfs_msg(rep->r_td,
1395 nmp->nm_mountp->mnt_stat.f_mntfromname,
1396 "not responding");
1397 rep->r_flags |= R_TPRINTFMSG;
1399 if (rep->r_rexmit >= rep->r_retry) { /* too many */
1400 nfsstats.rpctimeouts++;
1401 nfs_softterm(rep);
1402 goto skip;
1404 if (nmp->nm_sotype != SOCK_DGRAM) {
1405 if (++rep->r_rexmit > NFS_MAXREXMIT)
1406 rep->r_rexmit = NFS_MAXREXMIT;
1407 goto skip;
1409 if ((so = nmp->nm_so) == NULL)
1410 goto skip;
1413  * If there is enough space and the window allows,
1414  * resend it.
1415 * Set r_rtt to -1 in case we fail to send it now.
1417 rep->r_rtt = -1;
1418 if (sbspace(&so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
1419 ((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1420 (rep->r_flags & R_SENT) ||
1421 nmp->nm_sent < nmp->nm_cwnd) &&
1422 (m = m_copym(rep->r_mreq, 0, M_COPYALL, MB_DONTWAIT))){
1423 if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
1424 error = so_pru_send(so, 0, m, (struct sockaddr *)0,
1425 (struct mbuf *)0, td);
1426 else
1427 error = so_pru_send(so, 0, m, nmp->nm_nam,
1428 (struct mbuf *)0, td);
1429 if (error) {
1430 if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
1431 so->so_error = 0;
1432 } else if (rep->r_mrep == NULL) {
1434 * Iff first send, start timing
1435 * else turn timing off, backoff timer
1436 * and divide congestion window by 2.
1438 * It is possible for the so_pru_send() to
1439 * block and for us to race a reply so we
1440 * only do this if the reply field has not
1441 * been filled in. R_LOCKED will prevent
1442 * the request from being ripped out from under
1443 * us entirely.
1445 if (rep->r_flags & R_SENT) {
1446 rep->r_flags &= ~R_TIMING;
1447 if (++rep->r_rexmit > NFS_MAXREXMIT)
1448 rep->r_rexmit = NFS_MAXREXMIT;
1449 nmp->nm_cwnd >>= 1;
1450 if (nmp->nm_cwnd < NFS_CWNDSCALE)
1451 nmp->nm_cwnd = NFS_CWNDSCALE;
1452 nfsstats.rpcretries++;
1453 } else {
1454 rep->r_flags |= R_SENT;
1455 nmp->nm_sent += NFS_CWNDSCALE;
1457 rep->r_rtt = 0;
1460 skip:
1461 rep->r_flags &= ~R_LOCKED;
1463 #ifndef NFS_NOSERVER
1466 * Scan the write gathering queues for writes that need to be
1467 * completed now.
1469 cur_usec = nfs_curusec();
1470 TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) {
1471 if (slp->ns_tq.lh_first && slp->ns_tq.lh_first->nd_time<=cur_usec)
1472 nfsrv_wakenfsd(slp, 1);
1474 #endif /* NFS_NOSERVER */
1477 * Due to possible blocking, a client operation may be waiting for
1478 * us to finish processing this request so it can remove it.
1480 if (nfs_timer_raced) {
1481 nfs_timer_raced = 0;
1482 wakeup(&nfs_timer_raced);
1484 crit_exit();
1485 callout_reset(&nfs_timer_handle, nfs_ticks, nfs_timer, NULL);
1489 * Mark all of an nfs mount's outstanding requests with R_SOFTTERM and
1490 * wait for all requests to complete. This is used by forced unmounts
1491 * to terminate any outstanding RPCs.
1494 nfs_nmcancelreqs(struct nfsmount *nmp)
1496 struct nfsreq *req;
1497 int i;
1499 crit_enter();
1500 TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
1501 if (nmp != req->r_nmp || req->r_mrep != NULL ||
1502 (req->r_flags & R_SOFTTERM)) {
1503 continue;
1505 nfs_softterm(req);
1507 crit_exit();
1509 for (i = 0; i < 30; i++) {
1510 crit_enter();
1511 TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
1512 if (nmp == req->r_nmp)
1513 break;
1515 crit_exit();
1516 if (req == NULL)
1517 return (0);
1518 tsleep(&lbolt, 0, "nfscancel", 0);
1520 return (EBUSY);
1524 * Flag a request as being about to terminate (due to NFSMNT_INT/NFSMNT_SOFT).
1525  * The nm_sent count is decremented now to avoid deadlocks when the process in
1526 * soreceive() hasn't yet managed to send its own request.
1528 * This routine must be called at splsoftclock() to protect r_flags and
1529 * nm_sent.
1532 static void
1533 nfs_softterm(struct nfsreq *rep)
1535 rep->r_flags |= R_SOFTTERM;
1537 if (rep->r_flags & R_SENT) {
1538 rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
1539 rep->r_flags &= ~R_SENT;
1544 * Test for a termination condition pending on the process.
1545 * This is used for NFSMNT_INT mounts.
1548 nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct thread *td)
1550 sigset_t tmpset;
1551 struct proc *p;
1552 struct lwp *lp;
1554 if (rep && (rep->r_flags & R_SOFTTERM))
1555 return (EINTR);
1556 /* Terminate all requests while attempting a forced unmount. */
1557 if (nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF)
1558 return (EINTR);
1559 if (!(nmp->nm_flag & NFSMNT_INT))
1560 return (0);
1561 /* td might be NULL YYY */
1562 if (td == NULL || (p = td->td_proc) == NULL)
1563 return (0);
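	/*
	 * Only a pending signal that is neither masked nor ignored and
	 * that falls within NFSINT_SIGMASK (traditionally the likes of
	 * SIGINT, SIGTERM, SIGKILL, SIGHUP and SIGQUIT; see nfs.h for
	 * the authoritative set) interrupts the rpc.
	 */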
1565 lp = td->td_lwp;
1566 tmpset = lwp_sigpend(lp);
1567 SIGSETNAND(tmpset, lp->lwp_sigmask);
1568 SIGSETNAND(tmpset, p->p_sigignore);
1569 if (SIGNOTEMPTY(tmpset) && NFSINT_SIGMASK(tmpset))
1570 return (EINTR);
1572 return (0);
1576 * Lock a socket against others.
1577 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
1578 * and also to avoid race conditions between the processes with nfs requests
1579 * in progress when a reconnect is necessary.
1582 nfs_sndlock(struct nfsreq *rep)
1584 int *statep = &rep->r_nmp->nm_state;
1585 struct thread *td;
1586 int slptimeo;
1587 int slpflag;
1588 int error;
1590 slpflag = 0;
1591 slptimeo = 0;
1592 td = rep->r_td;
1593 if (rep->r_nmp->nm_flag & NFSMNT_INT)
1594 slpflag = PCATCH;
1596 error = 0;
1597 crit_enter();
1598 while (*statep & NFSSTA_SNDLOCK) {
1599 *statep |= NFSSTA_WANTSND;
1600 if (nfs_sigintr(rep->r_nmp, rep, td)) {
1601 error = EINTR;
1602 break;
1604 tsleep((caddr_t)statep, slpflag, "nfsndlck", slptimeo);
1605 if (slpflag == PCATCH) {
1606 slpflag = 0;
1607 slptimeo = 2 * hz;
1610 /* Always fail if our request has been cancelled. */
1611 if ((rep->r_flags & R_SOFTTERM))
1612 error = EINTR;
1613 if (error == 0)
1614 *statep |= NFSSTA_SNDLOCK;
1615 crit_exit();
1616 return (error);
1620 * Unlock the stream socket for others.
1622 void
1623 nfs_sndunlock(struct nfsreq *rep)
1625 int *statep = &rep->r_nmp->nm_state;
1627 if ((*statep & NFSSTA_SNDLOCK) == 0)
1628 panic("nfs sndunlock");
1629 crit_enter();
1630 *statep &= ~NFSSTA_SNDLOCK;
1631 if (*statep & NFSSTA_WANTSND) {
1632 *statep &= ~NFSSTA_WANTSND;
1633 wakeup((caddr_t)statep);
1635 crit_exit();
1638 static int
1639 nfs_rcvlock(struct nfsreq *rep)
1641 int *statep = &rep->r_nmp->nm_state;
1642 int slpflag;
1643 int slptimeo;
1644 int error;
1647 * Unconditionally check for completion in case another nfsiod
1648  * got the packet while the caller was blocked, before the caller
1649 * called us. Packet reception is handled by mainline code which
1650 * is protected by the BGL at the moment.
1652 * We do not strictly need the second check just before the
1653 * tsleep(), but it's good defensive programming.
1655 if (rep->r_mrep != NULL)
1656 return (EALREADY);
1658 if (rep->r_nmp->nm_flag & NFSMNT_INT)
1659 slpflag = PCATCH;
1660 else
1661 slpflag = 0;
1662 slptimeo = 0;
1663 error = 0;
1664 crit_enter();
1665 while (*statep & NFSSTA_RCVLOCK) {
1666 if (nfs_sigintr(rep->r_nmp, rep, rep->r_td)) {
1667 error = EINTR;
1668 break;
1670 if (rep->r_mrep != NULL) {
1671 error = EALREADY;
1672 break;
1674 *statep |= NFSSTA_WANTRCV;
1675 tsleep((caddr_t)statep, slpflag, "nfsrcvlk", slptimeo);
1677  * If our reply was received while we were sleeping,
1678 * then just return without taking the lock to avoid a
1679 * situation where a single iod could 'capture' the
1680  * receive lock.
1682 if (rep->r_mrep != NULL) {
1683 error = EALREADY;
1684 break;
1686 if (slpflag == PCATCH) {
1687 slpflag = 0;
1688 slptimeo = 2 * hz;
1691 if (error == 0) {
1692 *statep |= NFSSTA_RCVLOCK;
1693 rep->r_nmp->nm_rcvlock_td = curthread; /* DEBUGGING */
1695 crit_exit();
1696 return (error);
1700 * Unlock the stream socket for others.
1702 static void
1703 nfs_rcvunlock(struct nfsreq *rep)
1705 int *statep = &rep->r_nmp->nm_state;
1707 if ((*statep & NFSSTA_RCVLOCK) == 0)
1708 panic("nfs rcvunlock");
1709 crit_enter();
1710 rep->r_nmp->nm_rcvlock_td = (void *)-1; /* DEBUGGING */
1711 *statep &= ~NFSSTA_RCVLOCK;
1712 if (*statep & NFSSTA_WANTRCV) {
1713 *statep &= ~NFSSTA_WANTRCV;
1714 wakeup((caddr_t)statep);
1716 crit_exit();
1720 * nfs_realign:
1722 * Check for badly aligned mbuf data and realign by copying the unaligned
1723 * portion of the data into a new mbuf chain and freeing the portions
1724 * of the old chain that were replaced.
1726 * We cannot simply realign the data within the existing mbuf chain
1727 * because the underlying buffers may contain other rpc commands and
1728 * we cannot afford to overwrite them.
1730 * We would prefer to avoid this situation entirely. The situation does
1731  * not occur with NFS/UDP and is supposed to only occasionally occur
1732 * with TCP. Use vfs.nfs.realign_count and realign_test to check this.
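 *
 * For example, a single mbuf whose m_len is 2 knocks every later
 * word in the chain off its longword boundary; the scan below stops
 * at the first such mbuf and copies the remainder of the chain into
 * a fresh, aligned mbuf obtained from m_getl().
 */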
1734 static void
1735 nfs_realign(struct mbuf **pm, int hsiz)
1737 struct mbuf *m;
1738 struct mbuf *n = NULL;
1739 int off = 0;
1741 ++nfs_realign_test;
1743 while ((m = *pm) != NULL) {
1744 if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
1745 n = m_getl(m->m_len, MB_WAIT, MT_DATA, 0, NULL);
1746 n->m_len = 0;
1747 break;
1749 pm = &m->m_next;
1753 * If n is non-NULL, loop on m copying data, then replace the
1754 * portion of the chain that had to be realigned.
1756 if (n != NULL) {
1757 ++nfs_realign_count;
1758 while (m) {
1759 m_copyback(n, off, m->m_len, mtod(m, caddr_t));
1760 off += m->m_len;
1761 m = m->m_next;
1763 m_freem(*pm);
1764 *pm = n;
1768 #ifndef NFS_NOSERVER
1771 * Parse an RPC request
1772 * - verify it
1773 * - fill in the cred struct.
1776 nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
1778 int len, i;
1779 u_int32_t *tl;
1780 int32_t t1;
1781 struct uio uio;
1782 struct iovec iov;
1783 caddr_t dpos, cp2, cp;
1784 u_int32_t nfsvers, auth_type;
1785 uid_t nickuid;
1786 int error = 0, ticklen;
1787 struct mbuf *mrep, *md;
1788 struct nfsuid *nuidp;
1789 struct timeval tvin, tvout;
1790 #if 0 /* until encrypted keys are implemented */
1791 NFSKERBKEYSCHED_T keys; /* stores key schedule */
1792 #endif
1794 mrep = nd->nd_mrep;
1795 md = nd->nd_md;
1796 dpos = nd->nd_dpos;
1797 if (has_header) {
1798 nfsm_dissect(tl, u_int32_t *, 10 * NFSX_UNSIGNED);
1799 nd->nd_retxid = fxdr_unsigned(u_int32_t, *tl++);
1800 if (*tl++ != rpc_call) {
1801 m_freem(mrep);
1802 return (EBADRPC);
1804 } else
1805 nfsm_dissect(tl, u_int32_t *, 8 * NFSX_UNSIGNED);
1806 nd->nd_repstat = 0;
1807 nd->nd_flag = 0;
1808 if (*tl++ != rpc_vers) {
1809 nd->nd_repstat = ERPCMISMATCH;
1810 nd->nd_procnum = NFSPROC_NOOP;
1811 return (0);
1813 if (*tl != nfs_prog) {
1814 nd->nd_repstat = EPROGUNAVAIL;
1815 nd->nd_procnum = NFSPROC_NOOP;
1816 return (0);
1818 tl++;
1819 nfsvers = fxdr_unsigned(u_int32_t, *tl++);
1820 if (nfsvers < NFS_VER2 || nfsvers > NFS_VER3) {
1821 nd->nd_repstat = EPROGMISMATCH;
1822 nd->nd_procnum = NFSPROC_NOOP;
1823 return (0);
1825 if (nfsvers == NFS_VER3)
1826 nd->nd_flag = ND_NFSV3;
1827 nd->nd_procnum = fxdr_unsigned(u_int32_t, *tl++);
1828 if (nd->nd_procnum == NFSPROC_NULL)
1829 return (0);
1830 if (nd->nd_procnum >= NFS_NPROCS ||
1831 (nd->nd_procnum >= NQNFSPROC_GETLEASE) ||
1832 (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
1833 nd->nd_repstat = EPROCUNAVAIL;
1834 nd->nd_procnum = NFSPROC_NOOP;
1835 return (0);
1837 if ((nd->nd_flag & ND_NFSV3) == 0)
1838 nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
1839 auth_type = *tl++;
1840 len = fxdr_unsigned(int, *tl++);
1841 if (len < 0 || len > RPCAUTH_MAXSIZ) {
1842 m_freem(mrep);
1843 return (EBADRPC);
1846 nd->nd_flag &= ~ND_KERBAUTH;
1848 * Handle auth_unix or auth_kerb.
1850 if (auth_type == rpc_auth_unix) {
1851 len = fxdr_unsigned(int, *++tl);
1852 if (len < 0 || len > NFS_MAXNAMLEN) {
1853 m_freem(mrep);
1854 return (EBADRPC);
1856 nfsm_adv(nfsm_rndup(len));
1857 nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
1858 bzero((caddr_t)&nd->nd_cr, sizeof (struct ucred));
1859 nd->nd_cr.cr_ref = 1;
1860 nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
1861 nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++);
1862 len = fxdr_unsigned(int, *tl);
1863 if (len < 0 || len > RPCAUTH_UNIXGIDS) {
1864 m_freem(mrep);
1865 return (EBADRPC);
1867 nfsm_dissect(tl, u_int32_t *, (len + 2) * NFSX_UNSIGNED);
1868 for (i = 1; i <= len; i++)
1869 if (i < NGROUPS)
1870 nd->nd_cr.cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
1871 else
1872 tl++;
1873 nd->nd_cr.cr_ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
1874 if (nd->nd_cr.cr_ngroups > 1)
1875 nfsrvw_sort(nd->nd_cr.cr_groups, nd->nd_cr.cr_ngroups);
1876 len = fxdr_unsigned(int, *++tl);
1877 if (len < 0 || len > RPCAUTH_MAXSIZ) {
1878 m_freem(mrep);
1879 return (EBADRPC);
1881 if (len > 0)
1882 nfsm_adv(nfsm_rndup(len));
1883 } else if (auth_type == rpc_auth_kerb) {
1884 switch (fxdr_unsigned(int, *tl++)) {
1885 case RPCAKN_FULLNAME:
1886 ticklen = fxdr_unsigned(int, *tl);
1887 *((u_int32_t *)nfsd->nfsd_authstr) = *tl;
1888 uio.uio_resid = nfsm_rndup(ticklen) + NFSX_UNSIGNED;
1889 nfsd->nfsd_authlen = uio.uio_resid + NFSX_UNSIGNED;
1890 if (uio.uio_resid > (len - 2 * NFSX_UNSIGNED)) {
1891 m_freem(mrep);
1892 return (EBADRPC);
1894 uio.uio_offset = 0;
1895 uio.uio_iov = &iov;
1896 uio.uio_iovcnt = 1;
1897 uio.uio_segflg = UIO_SYSSPACE;
1898 iov.iov_base = (caddr_t)&nfsd->nfsd_authstr[4];
1899 iov.iov_len = RPCAUTH_MAXSIZ - 4;
1900 nfsm_mtouio(&uio, uio.uio_resid);
1901 nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
1902 if (*tl++ != rpc_auth_kerb ||
1903 fxdr_unsigned(int, *tl) != 4 * NFSX_UNSIGNED) {
1904 kprintf("Bad kerb verifier\n");
1905 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
1906 nd->nd_procnum = NFSPROC_NOOP;
1907 return (0);
1909 nfsm_dissect(cp, caddr_t, 4 * NFSX_UNSIGNED);
1910 tl = (u_int32_t *)cp;
1911 if (fxdr_unsigned(int, *tl) != RPCAKN_FULLNAME) {
1912 kprintf("Not fullname kerb verifier\n");
1913 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
1914 nd->nd_procnum = NFSPROC_NOOP;
1915 return (0);
1917 cp += NFSX_UNSIGNED;
1918 bcopy(cp, nfsd->nfsd_verfstr, 3 * NFSX_UNSIGNED);
1919 nfsd->nfsd_verflen = 3 * NFSX_UNSIGNED;
1920 nd->nd_flag |= ND_KERBFULL;
1921 nfsd->nfsd_flag |= NFSD_NEEDAUTH;
1922 break;
1923 case RPCAKN_NICKNAME:
1924 if (len != 2 * NFSX_UNSIGNED) {
1925 kprintf("Kerb nickname short\n");
1926 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADCRED);
1927 nd->nd_procnum = NFSPROC_NOOP;
1928 return (0);
1930 nickuid = fxdr_unsigned(uid_t, *tl);
1931 nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
1932 if (*tl++ != rpc_auth_kerb ||
1933 fxdr_unsigned(int, *tl) != 3 * NFSX_UNSIGNED) {
1934 kprintf("Kerb nick verifier bad\n");
1935 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
1936 nd->nd_procnum = NFSPROC_NOOP;
1937 return (0);
1939 nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
1940 tvin.tv_sec = *tl++;
1941 tvin.tv_usec = *tl;
1943 for (nuidp = NUIDHASH(nfsd->nfsd_slp,nickuid)->lh_first;
1944 nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
1945 if (nuidp->nu_cr.cr_uid == nickuid &&
1946 (!nd->nd_nam2 ||
1947 netaddr_match(NU_NETFAM(nuidp),
1948 &nuidp->nu_haddr, nd->nd_nam2)))
1949 break;
1951 if (!nuidp) {
1952 nd->nd_repstat =
1953 (NFSERR_AUTHERR|AUTH_REJECTCRED);
1954 nd->nd_procnum = NFSPROC_NOOP;
1955 return (0);
1959 * Now, decrypt the timestamp using the session key
1960 * and validate it.
1962 #ifdef NFSKERB
1964 #endif
1966 tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec);
1967 tvout.tv_usec = fxdr_unsigned(long, tvout.tv_usec);
1968 if (nuidp->nu_expire < time_second ||
1969 nuidp->nu_timestamp.tv_sec > tvout.tv_sec ||
1970 (nuidp->nu_timestamp.tv_sec == tvout.tv_sec &&
1971 nuidp->nu_timestamp.tv_usec > tvout.tv_usec)) {
1972 nuidp->nu_expire = 0;
1973 nd->nd_repstat =
1974 (NFSERR_AUTHERR|AUTH_REJECTVERF);
1975 nd->nd_procnum = NFSPROC_NOOP;
1976 return (0);
1978 nfsrv_setcred(&nuidp->nu_cr, &nd->nd_cr);
1979 nd->nd_flag |= ND_KERBNICK;
1981 } else {
1982 nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
1983 nd->nd_procnum = NFSPROC_NOOP;
1984 return (0);
1987 nd->nd_md = md;
1988 nd->nd_dpos = dpos;
1989 return (0);
1990 nfsmout:
1991 return (error);
1994 #endif
1997 * Send a message to the originating process's terminal. The thread and/or
1998 * process may be NULL. YYY the thread should not be NULL but there may
1999  * still be some uio_td's being passed as NULL through to
2000 * nfsm_request().
2002 static int
2003 nfs_msg(struct thread *td, char *server, char *msg)
2005 tpr_t tpr;
2007 if (td && td->td_proc)
2008 tpr = tprintf_open(td->td_proc);
2009 else
2010 tpr = NULL;
2011 tprintf(tpr, "nfs server %s: %s\n", server, msg);
2012 tprintf_close(tpr);
2013 return (0);
2016 #ifndef NFS_NOSERVER
2018 * Socket upcall routine for the nfsd sockets.
2019 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
2020 * Essentially do as much as possible non-blocking, else punt and it will
2021 * be called with MB_WAIT from an nfsd.
2023 void
2024 nfsrv_rcv(struct socket *so, void *arg, int waitflag)
2026 struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
2027 struct mbuf *m;
2028 struct mbuf *mp;
2029 struct sockaddr *nam;
2030 struct uio auio;
2031 int flags, error;
2032 int nparallel_wakeup = 0;
2034 if ((slp->ns_flag & SLP_VALID) == 0)
2035 return;
2038 * Do not allow an infinite number of completed RPC records to build
2039 * up before we stop reading data from the socket. Otherwise we could
2040 * end up holding onto an unreasonable number of mbufs for requests
2041 * waiting for service.
2043 * This should give pretty good feedback to the TCP
2044 * layer and prevents a memory crunch for other protocols.
2046 * Note that the same service socket can be dispatched to several
2047  * nfs servers simultaneously.
2049 * the tcp protocol callback calls us with MB_DONTWAIT.
2050 * nfsd calls us with MB_WAIT (typically).
2052 if (waitflag == MB_DONTWAIT && slp->ns_numrec >= nfsd_waiting / 2 + 1) {
2053 slp->ns_flag |= SLP_NEEDQ;
2054 goto dorecs;
2058 * Handle protocol specifics to parse an RPC request. We always
2059 * pull from the socket using non-blocking I/O.
2061 auio.uio_td = NULL;
2062 if (so->so_type == SOCK_STREAM) {
2064 * The data has to be read in an orderly fashion from a TCP
2065 * stream, unlike a UDP socket. It is possible for soreceive
2066 * and/or nfsrv_getstream() to block, so make sure only one
2067 * entity is messing around with the TCP stream at any given
2068 * moment. The receive sockbuf's lock in soreceive is not
2069 * sufficient.
2071 * Note that this procedure can be called from any number of
2072 * NFS servers *OR* can be upcalled directly from a TCP
2073 * protocol thread.
2074 */
2075 if (slp->ns_flag & SLP_GETSTREAM) {
2076 slp->ns_flag |= SLP_NEEDQ;
2077 goto dorecs;
2078 }
2079 slp->ns_flag |= SLP_GETSTREAM;
2081 /*
2082 * Do soreceive().
2083 */
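/*
 * The oversized uio_resid simply means "take everything currently
 * buffered"; MSG_DONTWAIT keeps soreceive() from sleeping, since this
 * may be running in the protocol thread's upcall context.
 */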
2084 auio.uio_resid = 1000000000;
2085 flags = MSG_DONTWAIT;
2086 error = so_pru_soreceive(so, &nam, &auio, &mp, NULL, &flags);
2087 if (error || mp == (struct mbuf *)0) {
2088 if (error == EWOULDBLOCK)
2089 slp->ns_flag |= SLP_NEEDQ;
2090 else
2091 slp->ns_flag |= SLP_DISCONN;
2092 slp->ns_flag &= ~SLP_GETSTREAM;
2093 goto dorecs;
2094 }
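/*
 * Append what soreceive() returned to the raw-data chain
 * (ns_raw/ns_rawend) and account for the new bytes in ns_cc.
 */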
2095 m = mp;
2096 if (slp->ns_rawend) {
2097 slp->ns_rawend->m_next = m;
2098 slp->ns_cc += 1000000000 - auio.uio_resid;
2099 } else {
2100 slp->ns_raw = m;
2101 slp->ns_cc = 1000000000 - auio.uio_resid;
2102 }
2103 while (m->m_next)
2104 m = m->m_next;
2105 slp->ns_rawend = m;
2107 /*
2108 * Now try and parse as many record(s) as we can out of the
2109 * raw stream data.
2110 */
2111 error = nfsrv_getstream(slp, waitflag, &nparallel_wakeup);
2112 if (error) {
2113 if (error == EPERM)
2114 slp->ns_flag |= SLP_DISCONN;
2115 else
2116 slp->ns_flag |= SLP_NEEDQ;
2117 }
2118 slp->ns_flag &= ~SLP_GETSTREAM;
2119 } else {
2120 /*
2121 * For UDP, soreceive() typically pulls just one packet, so loop
2122 * to get the whole batch.
2123 */
2124 do {
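/*
 * Each UDP datagram is one complete RPC record, so each packet can
 * be queued for an nfsd directly, without any record-mark parsing.
 */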
2125 auio.uio_resid = 1000000000;
2126 flags = MSG_DONTWAIT;
2127 error = so_pru_soreceive(so, &nam, &auio, &mp, NULL,
2128 &flags);
2129 if (mp) {
2130 struct nfsrv_rec *rec;
2131 int mf = (waitflag & MB_DONTWAIT) ?
2132 M_NOWAIT : M_WAITOK;
2133 rec = kmalloc(sizeof(struct nfsrv_rec),
2134 M_NFSRVDESC, mf);
2135 if (!rec) {
2136 if (nam)
2137 FREE(nam, M_SONAME);
2138 m_freem(mp);
2139 continue;
2140 }
2141 nfs_realign(&mp, 10 * NFSX_UNSIGNED);
2142 rec->nr_address = nam;
2143 rec->nr_packet = mp;
2144 STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
2145 ++slp->ns_numrec;
2146 ++nparallel_wakeup;
2147 }
2148 if (error) {
2149 if ((so->so_proto->pr_flags & PR_CONNREQUIRED)
2150 && error != EWOULDBLOCK) {
2151 slp->ns_flag |= SLP_DISCONN;
2152 goto dorecs;
2153 }
2154 }
2155 } while (mp);
2156 }
2158 /*
2159 * If we were upcalled from the tcp protocol layer and we have
2160 * fully parsed records ready to go, or there is new data pending,
2161 * or something went wrong, try to wake up an nfsd thread to deal
2162 * with it.
2163 */
2164 dorecs:
2165 if (waitflag == MB_DONTWAIT && (slp->ns_numrec > 0
2166 || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN)))) {
2167 nfsrv_wakenfsd(slp, nparallel_wakeup);
2168 }
2169 }
2171 /*
2172 * Try and extract an RPC request from the mbuf data list received on a
2173 * stream socket. The "waitflag" argument indicates whether or not it
2174 * can sleep.
2175 */
2176 static int
2177 nfsrv_getstream(struct nfssvc_sock *slp, int waitflag, int *countp)
2178 {
2179 struct mbuf *m, **mpp;
2180 char *cp1, *cp2;
2181 int len;
2182 struct mbuf *om, *m2, *recm;
2183 u_int32_t recmark;
2185 for (;;) {
2186 if (slp->ns_reclen == 0) {
2187 if (slp->ns_cc < NFSX_UNSIGNED)
2188 return (0);
2189 m = slp->ns_raw;
2190 if (m->m_len >= NFSX_UNSIGNED) {
2191 bcopy(mtod(m, caddr_t), (caddr_t)&recmark, NFSX_UNSIGNED);
2192 m->m_data += NFSX_UNSIGNED;
2193 m->m_len -= NFSX_UNSIGNED;
2194 } else {
2195 cp1 = (caddr_t)&recmark;
2196 cp2 = mtod(m, caddr_t);
2197 while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
2198 while (m->m_len == 0) {
2199 m = m->m_next;
2200 cp2 = mtod(m, caddr_t);
2201 }
2202 *cp1++ = *cp2++;
2203 m->m_data++;
2204 m->m_len--;
2205 }
2206 }
2207 slp->ns_cc -= NFSX_UNSIGNED;
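/*
 * Per the Sun RPC record-marking standard (RFC 1831) used on stream
 * sockets, the high bit of the 32-bit mark flags the last fragment
 * of a record and the low 31 bits give the fragment length in bytes.
 */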
2208 recmark = ntohl(recmark);
2209 slp->ns_reclen = recmark & ~0x80000000;
2210 if (recmark & 0x80000000)
2211 slp->ns_flag |= SLP_LASTFRAG;
2212 else
2213 slp->ns_flag &= ~SLP_LASTFRAG;
2214 if (slp->ns_reclen > NFS_MAXPACKET || slp->ns_reclen <= 0) {
2215 log(LOG_ERR, "%s (%d) from nfs client\n",
2216 "impossible packet length",
2217 slp->ns_reclen);
2218 return (EPERM);
2219 }
2220 }
2222 /*
2223 * Now get the record part.
2225 * Note that slp->ns_reclen may be 0. Linux sometimes
2226 * generates 0-length RPCs.
2227 */
2228 recm = NULL;
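/*
 * Three cases: the raw chain holds exactly the fragment (take it
 * whole), holds more than the fragment (split the chain at the
 * boundary), or is still short (return and wait for more data).
 */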
2229 if (slp->ns_cc == slp->ns_reclen) {
2230 recm = slp->ns_raw;
2231 slp->ns_raw = slp->ns_rawend = (struct mbuf *)0;
2232 slp->ns_cc = slp->ns_reclen = 0;
2233 } else if (slp->ns_cc > slp->ns_reclen) {
2234 len = 0;
2235 m = slp->ns_raw;
2236 om = (struct mbuf *)0;
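/*
 * Walk the chain collecting ns_reclen bytes; an mbuf straddling the
 * boundary is split with m_copym(), which can fail when we may not
 * sleep; returning EWOULDBLOCK lets an nfsd retry the parse later
 * with MB_WAIT.
 */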
2238 while (len < slp->ns_reclen) {
2239 if ((len + m->m_len) > slp->ns_reclen) {
2240 m2 = m_copym(m, 0, slp->ns_reclen - len,
2241 waitflag);
2242 if (m2) {
2243 if (om) {
2244 om->m_next = m2;
2245 recm = slp->ns_raw;
2246 } else
2247 recm = m2;
2248 m->m_data += slp->ns_reclen - len;
2249 m->m_len -= slp->ns_reclen - len;
2250 len = slp->ns_reclen;
2251 } else {
2252 return (EWOULDBLOCK);
2253 }
2254 } else if ((len + m->m_len) == slp->ns_reclen) {
2255 om = m;
2256 len += m->m_len;
2257 m = m->m_next;
2258 recm = slp->ns_raw;
2259 om->m_next = (struct mbuf *)0;
2260 } else {
2261 om = m;
2262 len += m->m_len;
2263 m = m->m_next;
2264 }
2265 }
2266 slp->ns_raw = m;
2267 slp->ns_cc -= len;
2268 slp->ns_reclen = 0;
2269 } else {
2270 return (0);
2271 }
2273 /*
2274 * Accumulate the fragments into a record.
2275 */
2276 mpp = &slp->ns_frag;
2277 while (*mpp)
2278 mpp = &((*mpp)->m_next);
2279 *mpp = recm;
2280 if (slp->ns_flag & SLP_LASTFRAG) {
2281 struct nfsrv_rec *rec;
2282 int mf = (waitflag & MB_DONTWAIT) ? M_NOWAIT : M_WAITOK;
2283 rec = kmalloc(sizeof(struct nfsrv_rec), M_NFSRVDESC, mf);
2284 if (!rec) {
2285 m_freem(slp->ns_frag);
2286 } else {
2287 nfs_realign(&slp->ns_frag, 10 * NFSX_UNSIGNED);
2288 rec->nr_address = (struct sockaddr *)0;
2289 rec->nr_packet = slp->ns_frag;
2290 STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
2291 ++slp->ns_numrec;
2292 ++*countp;
2293 }
2294 slp->ns_frag = (struct mbuf *)0;
2295 }
2296 }
2297 }
2299 /*
2300 * Parse an RPC header.
2301 */
2302 int
2303 nfsrv_dorec(struct nfssvc_sock *slp, struct nfsd *nfsd,
2304 struct nfsrv_descript **ndp)
2305 {
2306 struct nfsrv_rec *rec;
2307 struct mbuf *m;
2308 struct sockaddr *nam;
2309 struct nfsrv_descript *nd;
2310 int error;
2312 *ndp = NULL;
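/*
 * ENOBUFS is used here as the "no completed request is queued"
 * answer rather than as an allocation failure.
 */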
2313 if ((slp->ns_flag & SLP_VALID) == 0 || !STAILQ_FIRST(&slp->ns_rec))
2314 return (ENOBUFS);
2315 rec = STAILQ_FIRST(&slp->ns_rec);
2316 STAILQ_REMOVE_HEAD(&slp->ns_rec, nr_link);
2317 KKASSERT(slp->ns_numrec > 0);
2318 --slp->ns_numrec;
2319 nam = rec->nr_address;
2320 m = rec->nr_packet;
2321 kfree(rec, M_NFSRVDESC);
2322 MALLOC(nd, struct nfsrv_descript *, sizeof (struct nfsrv_descript),
2323 M_NFSRVDESC, M_WAITOK);
2324 nd->nd_md = nd->nd_mrep = m;
2325 nd->nd_nam2 = nam;
2326 nd->nd_dpos = mtod(m, caddr_t);
2327 error = nfs_getreq(nd, nfsd, TRUE);
2328 if (error) {
2329 if (nam) {
2330 FREE(nam, M_SONAME);
2331 }
2332 kfree((caddr_t)nd, M_NFSRVDESC);
2333 return (error);
2334 }
2335 *ndp = nd;
2336 nfsd->nfsd_nd = nd;
2337 return (0);
2338 }
2340 /*
2341 * Try to assign service sockets to nfsd threads based on the number
2342 * of new rpc requests that have been queued on the service socket.
2344 * If no nfsd's are available or additional requests are pending, set the
2345 * NFSD_CHECKSLP flag so that one of the running nfsds will go look for
2346 * the work in the nfssvc_sock list when it is finished processing its
2347 * current work. This flag is only cleared when an nfsd can not find
2348 * any new work to perform.
2349 */
2350 void
2351 nfsrv_wakenfsd(struct nfssvc_sock *slp, int nparallel)
2352 {
2353 struct nfsd *nd;
2355 if ((slp->ns_flag & SLP_VALID) == 0)
2356 return;
2357 if (nparallel <= 1)
2358 nparallel = 1;
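/*
 * Hand the socket to up to nparallel sleeping nfsd threads; each
 * handoff takes a new reference on the socket via ns_sref.
 */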
2359 TAILQ_FOREACH(nd, &nfsd_head, nfsd_chain) {
2360 if (nd->nfsd_flag & NFSD_WAITING) {
2361 nd->nfsd_flag &= ~NFSD_WAITING;
2362 if (nd->nfsd_slp)
2363 panic("nfsd wakeup");
2364 slp->ns_sref++;
2365 nd->nfsd_slp = slp;
2366 wakeup((caddr_t)nd);
2367 if (--nparallel == 0)
2368 break;
2369 }
2370 }
2371 if (nparallel) {
2372 slp->ns_flag |= SLP_DOREC;
2373 nfsd_head_flag |= NFSD_CHECKSLP;
2374 }
2375 }
2376 #endif /* NFS_NOSERVER */