NFS - Rewrite the RTT code and the request flags & low level state machine
[dragonfly.git] / sys / vfs / nfs / nfs_socket.c
1 /*
2 * Copyright (c) 1989, 1991, 1993, 1995
3 * The Regents of the University of California. All rights reserved.
5 * This code is derived from software contributed to Berkeley by
6 * Rick Macklem at The University of Guelph.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 * must display the following acknowledgement:
18 * This product includes software developed by the University of
19 * California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
36 * @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
37 * $FreeBSD: src/sys/nfs/nfs_socket.c,v 1.60.2.6 2003/03/26 01:44:46 alfred Exp $
38 * $DragonFly: src/sys/vfs/nfs/nfs_socket.c,v 1.45 2007/05/18 17:05:13 dillon Exp $
42 * Socket operations for use by nfs
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/proc.h>
48 #include <sys/malloc.h>
49 #include <sys/mount.h>
50 #include <sys/kernel.h>
51 #include <sys/mbuf.h>
52 #include <sys/vnode.h>
53 #include <sys/fcntl.h>
54 #include <sys/protosw.h>
55 #include <sys/resourcevar.h>
56 #include <sys/socket.h>
57 #include <sys/socketvar.h>
58 #include <sys/socketops.h>
59 #include <sys/syslog.h>
60 #include <sys/thread.h>
61 #include <sys/tprintf.h>
62 #include <sys/sysctl.h>
63 #include <sys/signalvar.h>
64 #include <sys/mutex.h>
66 #include <sys/signal2.h>
67 #include <sys/mutex2.h>
69 #include <netinet/in.h>
70 #include <netinet/tcp.h>
71 #include <sys/thread2.h>
73 #include "rpcv2.h"
74 #include "nfsproto.h"
75 #include "nfs.h"
76 #include "xdr_subs.h"
77 #include "nfsm_subs.h"
78 #include "nfsmount.h"
79 #include "nfsnode.h"
80 #include "nfsrtt.h"
82 #define TRUE 1
83 #define FALSE 0
86 * RTT calculations are scaled by 256 (8 bits). A proper fractional
87 * RTT will still be calculated even with a slow NFS timer.
89 #define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum]]
90 #define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum]]
91 #define NFS_RTT_SCALE_BITS 8 /* bits */
92 #define NFS_RTT_SCALE 256 /* value */
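/*
 * Worked example of the fixed-point scaling above (an illustrative
 * sketch added for clarity, not part of the original code): a smoothed
 * RTT of 2.5 ticks is stored as 2.5 * 256 = 640, and descaling back to
 * whole ticks is a single right shift, as nfs_timer_req() does:
 */
#if 0	/* illustrative only */
static int
nfs_rtt_descale_sketch(int srtt_scaled)
{
	/* 640 >> 8 == 2 ticks; the fractional part stays in scaled form */
	return (srtt_scaled >> NFS_RTT_SCALE_BITS);
}
#endif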
95 * Defines which timer to use for the procnum.
96 * 0 - default
97 * 1 - getattr
98 * 2 - lookup
99 * 3 - read
100 * 4 - write
102 static int proct[NFS_NPROCS] = {
103 0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0,
104 0, 0, 0,
105 };
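/*
 * For example, proct[NFSPROC_READ] (slot 6 above) is 3, so all reads
 * share the "read" RTT timer, while procedures mapped to 0 run with
 * R_TIMING off and fall back to the static default timeout.
 */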
107 static int nfs_backoff[8] = { 2, 3, 5, 8, 13, 21, 34, 55 };
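/*
 * nfs_backoff is the Fibonacci sequence starting at 2; nfs_timer_req()
 * multiplies the computed timeout by nfs_backoff[nm_timeouts - 1], so a
 * base timeout of 2 ticks grows to 4, 6, 10, 16, ... as consecutive
 * timeouts accumulate, a gentler ramp than straight doubling.
 */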
108 static int nfs_realign_test;
109 static int nfs_realign_count;
110 static int nfs_bufpackets = 4;
111 static int nfs_showrtt;
112 static int nfs_showrexmit;
114 SYSCTL_DECL(_vfs_nfs);
116 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RW, &nfs_realign_test, 0, "");
117 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RW, &nfs_realign_count, 0, "");
118 SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0, "");
119 SYSCTL_INT(_vfs_nfs, OID_AUTO, showrtt, CTLFLAG_RW, &nfs_showrtt, 0, "");
120 SYSCTL_INT(_vfs_nfs, OID_AUTO, showrexmit, CTLFLAG_RW, &nfs_showrexmit, 0, "");
122 static int nfs_request_setup(nfsm_info_t info);
123 static int nfs_request_auth(struct nfsreq *rep);
124 static int nfs_request_try(struct nfsreq *rep);
125 static int nfs_request_waitreply(struct nfsreq *rep);
126 static int nfs_request_processreply(nfsm_info_t info, int);
128 int nfsrtton = 0;
129 struct nfsrtt nfsrtt;
130 struct callout nfs_timer_handle;
132 static int nfs_msg (struct thread *,char *,char *);
133 static int nfs_rcvlock (struct nfsmount *nmp, struct nfsreq *myreq);
134 static void nfs_rcvunlock (struct nfsmount *nmp);
135 static void nfs_realign (struct mbuf **pm, int hsiz);
136 static int nfs_receive (struct nfsmount *nmp, struct nfsreq *rep,
137 struct sockaddr **aname, struct mbuf **mp);
138 static void nfs_softterm (struct nfsreq *rep, int islocked);
139 static void nfs_hardterm (struct nfsreq *rep, int islocked);
140 static int nfs_reconnect (struct nfsmount *nmp, struct nfsreq *rep);
141 #ifndef NFS_NOSERVER
142 static int nfsrv_getstream (struct nfssvc_sock *, int, int *);
143 static void nfs_timer_req(struct nfsreq *req);
145 int (*nfsrv3_procs[NFS_NPROCS]) (struct nfsrv_descript *nd,
146 struct nfssvc_sock *slp,
147 struct thread *td,
148 struct mbuf **mreqp) = {
149 nfsrv_null,
150 nfsrv_getattr,
151 nfsrv_setattr,
152 nfsrv_lookup,
153 nfsrv3_access,
154 nfsrv_readlink,
155 nfsrv_read,
156 nfsrv_write,
157 nfsrv_create,
158 nfsrv_mkdir,
159 nfsrv_symlink,
160 nfsrv_mknod,
161 nfsrv_remove,
162 nfsrv_rmdir,
163 nfsrv_rename,
164 nfsrv_link,
165 nfsrv_readdir,
166 nfsrv_readdirplus,
167 nfsrv_statfs,
168 nfsrv_fsinfo,
169 nfsrv_pathconf,
170 nfsrv_commit,
171 nfsrv_noop,
172 nfsrv_noop,
173 nfsrv_noop,
174 nfsrv_noop
176 #endif /* NFS_NOSERVER */
179 * Initialize sockets and congestion for a new NFS connection.
180 * We do not free the sockaddr if error.
183 nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
185 struct socket *so;
186 int error, rcvreserve, sndreserve;
187 int pktscale;
188 struct sockaddr *saddr;
189 struct sockaddr_in *sin;
190 struct thread *td = &thread0; /* only used for socreate and sobind */
192 nmp->nm_so = NULL;
193 saddr = nmp->nm_nam;
194 error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
195 nmp->nm_soproto, td);
196 if (error)
197 goto bad;
198 so = nmp->nm_so;
199 nmp->nm_soflags = so->so_proto->pr_flags;
202 * Some servers require that the client port be a reserved port number.
204 if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) {
205 struct sockopt sopt;
206 int ip;
207 struct sockaddr_in ssin;
209 bzero(&sopt, sizeof sopt);
210 ip = IP_PORTRANGE_LOW;
211 sopt.sopt_level = IPPROTO_IP;
212 sopt.sopt_name = IP_PORTRANGE;
213 sopt.sopt_val = (void *)&ip;
214 sopt.sopt_valsize = sizeof(ip);
215 sopt.sopt_td = NULL;
216 error = sosetopt(so, &sopt);
217 if (error)
218 goto bad;
219 bzero(&ssin, sizeof ssin);
220 sin = &ssin;
221 sin->sin_len = sizeof (struct sockaddr_in);
222 sin->sin_family = AF_INET;
223 sin->sin_addr.s_addr = INADDR_ANY;
224 sin->sin_port = htons(0);
225 error = sobind(so, (struct sockaddr *)sin, td);
226 if (error)
227 goto bad;
228 bzero(&sopt, sizeof sopt);
229 ip = IP_PORTRANGE_DEFAULT;
230 sopt.sopt_level = IPPROTO_IP;
231 sopt.sopt_name = IP_PORTRANGE;
232 sopt.sopt_val = (void *)&ip;
233 sopt.sopt_valsize = sizeof(ip);
234 sopt.sopt_td = NULL;
235 error = sosetopt(so, &sopt);
236 if (error)
237 goto bad;
241 * Protocols that do not require connections may be optionally left
242 * unconnected for servers that reply from a port other than NFS_PORT.
244 if (nmp->nm_flag & NFSMNT_NOCONN) {
245 if (nmp->nm_soflags & PR_CONNREQUIRED) {
246 error = ENOTCONN;
247 goto bad;
249 } else {
250 error = soconnect(so, nmp->nm_nam, td);
251 if (error)
252 goto bad;
255 * Wait for the connection to complete. Cribbed from the
256 * connect system call but with the wait timing out so
257 * that interruptible mounts don't hang here for a long time.
259 crit_enter();
260 while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
261 (void) tsleep((caddr_t)&so->so_timeo, 0,
262 "nfscon", 2 * hz);
263 if ((so->so_state & SS_ISCONNECTING) &&
264 so->so_error == 0 && rep &&
265 (error = nfs_sigintr(nmp, rep, rep->r_td)) != 0){
266 so->so_state &= ~SS_ISCONNECTING;
267 crit_exit();
268 goto bad;
271 if (so->so_error) {
272 error = so->so_error;
273 so->so_error = 0;
274 crit_exit();
275 goto bad;
277 crit_exit();
279 so->so_rcv.ssb_timeo = (5 * hz);
280 so->so_snd.ssb_timeo = (5 * hz);
283 * Get buffer reservation size from sysctl, but impose reasonable
284 * limits.
286 pktscale = nfs_bufpackets;
287 if (pktscale < 2)
288 pktscale = 2;
289 if (pktscale > 64)
290 pktscale = 64;
292 if (nmp->nm_sotype == SOCK_DGRAM) {
293 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
294 rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
295 NFS_MAXPKTHDR) * pktscale;
296 } else if (nmp->nm_sotype == SOCK_SEQPACKET) {
297 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
298 rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
299 NFS_MAXPKTHDR) * pktscale;
300 } else {
301 if (nmp->nm_sotype != SOCK_STREAM)
302 panic("nfscon sotype");
303 if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
304 struct sockopt sopt;
305 int val;
307 bzero(&sopt, sizeof sopt);
308 sopt.sopt_level = SOL_SOCKET;
309 sopt.sopt_name = SO_KEEPALIVE;
310 sopt.sopt_val = &val;
311 sopt.sopt_valsize = sizeof val;
312 val = 1;
313 sosetopt(so, &sopt);
315 if (so->so_proto->pr_protocol == IPPROTO_TCP) {
316 struct sockopt sopt;
317 int val;
319 bzero(&sopt, sizeof sopt);
320 sopt.sopt_level = IPPROTO_TCP;
321 sopt.sopt_name = TCP_NODELAY;
322 sopt.sopt_val = &val;
323 sopt.sopt_valsize = sizeof val;
324 val = 1;
325 sosetopt(so, &sopt);
327 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
328 sizeof (u_int32_t)) * pktscale;
329 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
330 sizeof (u_int32_t)) * pktscale;
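/*
 * Rough worked example (assuming a typical nm_wsize of 32768 and the
 * default pktscale of 4): the stream send reservation comes to about
 * (32768 + NFS_MAXPKTHDR + 4) * 4, i.e. a bit over 128KB. The extra
 * u_int32_t per packet accounts for the RPC record mark that stream
 * transports prepend to each request.
 */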
332 error = soreserve(so, sndreserve, rcvreserve,
333 &td->td_proc->p_rlimit[RLIMIT_SBSIZE]);
334 if (error)
335 goto bad;
336 so->so_rcv.ssb_flags |= SSB_NOINTR;
337 so->so_snd.ssb_flags |= SSB_NOINTR;
339 /* Initialize other non-zero congestion variables */
340 nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
341 nmp->nm_srtt[3] = (NFS_TIMEO << NFS_RTT_SCALE_BITS);
342 nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
343 nmp->nm_sdrtt[3] = 0;
344 nmp->nm_maxasync_scaled = NFS_MINASYNC_SCALED;
345 nmp->nm_timeouts = 0;
346 return (0);
348 bad:
349 nfs_disconnect(nmp);
350 return (error);
354 * Reconnect routine:
355 * Called when a connection is broken on a reliable protocol.
356 * - clean up the old socket
357 * - nfs_connect() again
358 * - set R_NEEDSXMIT for all outstanding requests on mount point
359 * If this fails the mount point is DEAD!
360 * nb: Must be called with the nfs_sndlock() set on the mount point.
362 static int
363 nfs_reconnect(struct nfsmount *nmp, struct nfsreq *rep)
365 struct nfsreq *req;
366 int error;
368 nfs_disconnect(nmp);
369 while ((error = nfs_connect(nmp, rep)) != 0) {
370 if (error == EINTR || error == ERESTART)
371 return (EINTR);
372 (void) tsleep((caddr_t)&lbolt, 0, "nfscon", 0);
376 * Loop through outstanding request list and fix up all requests
377 * on old socket.
379 crit_enter();
380 TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) {
381 KKASSERT(req->r_nmp == nmp);
382 req->r_flags |= R_NEEDSXMIT;
384 crit_exit();
385 return (0);
389 * NFS disconnect. Clean up and unlink.
391 void
392 nfs_disconnect(struct nfsmount *nmp)
394 struct socket *so;
396 if (nmp->nm_so) {
397 so = nmp->nm_so;
398 nmp->nm_so = NULL;
399 soshutdown(so, SHUT_RDWR);
400 soclose(so, FNONBLOCK);
404 void
405 nfs_safedisconnect(struct nfsmount *nmp)
407 nfs_rcvlock(nmp, NULL);
408 nfs_disconnect(nmp);
409 nfs_rcvunlock(nmp);
413 * This is the nfs send routine. For connection based socket types, it
414 * must be called with an nfs_sndlock() on the socket.
415 * "rep == NULL" indicates that it has been called from a server.
416 * For the client side:
417 * - return EINTR if the RPC is terminated, 0 otherwise
418 * - set R_NEEDSXMIT if the send fails for any reason
419 * - do any cleanup required by recoverable socket errors (?)
420 * For the server side:
421 * - return EINTR or ERESTART if interrupted by a signal
422 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
423 * - do any cleanup required by recoverable socket errors (?)
426 nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top,
427 struct nfsreq *rep)
429 struct sockaddr *sendnam;
430 int error, soflags, flags;
432 if (rep) {
433 if (rep->r_flags & R_SOFTTERM) {
434 m_freem(top);
435 return (EINTR);
437 if ((so = rep->r_nmp->nm_so) == NULL) {
438 rep->r_flags |= R_NEEDSXMIT;
439 m_freem(top);
440 return (0);
442 rep->r_flags &= ~R_NEEDSXMIT;
443 soflags = rep->r_nmp->nm_soflags;
444 } else {
445 soflags = so->so_proto->pr_flags;
447 if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
448 sendnam = NULL;
449 else
450 sendnam = nam;
451 if (so->so_type == SOCK_SEQPACKET)
452 flags = MSG_EOR;
453 else
454 flags = 0;
456 error = so_pru_sosend(so, sendnam, NULL, top, NULL, flags,
457 curthread /*XXX*/);
459 * ENOBUFS for dgram sockets is transient and non-fatal.
460 * No need to log, and no need to break a soft mount.
462 if (error == ENOBUFS && so->so_type == SOCK_DGRAM) {
463 error = 0;
465 * do backoff retransmit on client
467 if (rep)
468 rep->r_flags |= R_NEEDSXMIT;
471 if (error) {
472 if (rep) {
473 log(LOG_INFO, "nfs send error %d for server %s\n",error,
474 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
476 * Deal with errors for the client side.
478 if (rep->r_flags & R_SOFTTERM)
479 error = EINTR;
480 else
481 rep->r_flags |= R_NEEDSXMIT;
482 } else {
483 log(LOG_INFO, "nfsd send error %d\n", error);
487 * Handle any recoverable (soft) socket errors here. (?)
489 if (error != EINTR && error != ERESTART &&
490 error != EWOULDBLOCK && error != EPIPE)
491 error = 0;
493 return (error);
497 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
498 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
499 * Mark and consolidate the data into a new mbuf list.
500 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
501 * small mbufs.
502 * For SOCK_STREAM we must be very careful to read an entire record once
503 * we have read any of it, even if the system call has been interrupted.
505 static int
506 nfs_receive(struct nfsmount *nmp, struct nfsreq *rep,
507 struct sockaddr **aname, struct mbuf **mp)
509 struct socket *so;
510 struct sockbuf sio;
511 struct uio auio;
512 struct iovec aio;
513 struct mbuf *m;
514 struct mbuf *control;
515 u_int32_t len;
516 struct sockaddr **getnam;
517 int error, sotype, rcvflg;
518 struct thread *td = curthread; /* XXX */
521 * Set up arguments for soreceive()
523 *mp = NULL;
524 *aname = NULL;
525 sotype = nmp->nm_sotype;
528 * For reliable protocols, lock against other senders/receivers
529 * in case a reconnect is necessary.
530 * For SOCK_STREAM, first get the Record Mark to find out how much
531 * more there is to get.
532 * We must lock the socket against other receivers
533 * until we have an entire rpc request/reply.
535 if (sotype != SOCK_DGRAM) {
536 error = nfs_sndlock(nmp, rep);
537 if (error)
538 return (error);
539 tryagain:
541 * Check for fatal errors and resending request.
544 * Ugh: If a reconnect attempt just happened, nm_so
545 * would have changed. NULL indicates a failed
546 * attempt that has essentially shut down this
547 * mount point.
549 if (rep && (rep->r_mrep || (rep->r_flags & R_SOFTTERM))) {
550 nfs_sndunlock(nmp);
551 return (EINTR);
553 so = nmp->nm_so;
554 if (so == NULL) {
555 error = nfs_reconnect(nmp, rep);
556 if (error) {
557 nfs_sndunlock(nmp);
558 return (error);
560 goto tryagain;
562 while (rep && (rep->r_flags & R_NEEDSXMIT)) {
563 m = m_copym(rep->r_mreq, 0, M_COPYALL, MB_WAIT);
564 nfsstats.rpcretries++;
565 error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
566 if (error) {
567 if (error == EINTR || error == ERESTART ||
568 (error = nfs_reconnect(nmp, rep)) != 0) {
569 nfs_sndunlock(nmp);
570 return (error);
572 goto tryagain;
575 nfs_sndunlock(nmp);
576 if (sotype == SOCK_STREAM) {
578 * Get the length marker from the stream
580 aio.iov_base = (caddr_t)&len;
581 aio.iov_len = sizeof(u_int32_t);
582 auio.uio_iov = &aio;
583 auio.uio_iovcnt = 1;
584 auio.uio_segflg = UIO_SYSSPACE;
585 auio.uio_rw = UIO_READ;
586 auio.uio_offset = 0;
587 auio.uio_resid = sizeof(u_int32_t);
588 auio.uio_td = td;
589 do {
590 rcvflg = MSG_WAITALL;
591 error = so_pru_soreceive(so, NULL, &auio, NULL,
592 NULL, &rcvflg);
593 if (error == EWOULDBLOCK && rep) {
594 if (rep->r_flags & R_SOFTTERM)
595 return (EINTR);
597 } while (error == EWOULDBLOCK);
599 if (error == 0 && auio.uio_resid > 0) {
601 * Only log short packets if not EOF
603 if (auio.uio_resid != sizeof(u_int32_t))
604 log(LOG_INFO,
605 "short receive (%d/%d) from nfs server %s\n",
606 (int)(sizeof(u_int32_t) - auio.uio_resid),
607 (int)sizeof(u_int32_t),
608 nmp->nm_mountp->mnt_stat.f_mntfromname);
609 error = EPIPE;
611 if (error)
612 goto errout;
613 len = ntohl(len) & ~0x80000000;
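/*
 * Example: a record mark of 0x8000012c decodes to "last fragment"
 * (the high bit) plus a fragment length of 0x12c == 300 bytes; only
 * the length survives the mask above.
 */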
615 * This is SERIOUS! We are out of sync with the sender
616 * and forcing a disconnect/reconnect is all I can do.
618 if (len > NFS_MAXPACKET) {
619 log(LOG_ERR, "%s (%d) from nfs server %s\n",
620 "impossible packet length",
621 len,
622 nmp->nm_mountp->mnt_stat.f_mntfromname);
623 error = EFBIG;
624 goto errout;
628 * Get the rest of the packet as an mbuf chain
630 sbinit(&sio, len);
631 do {
632 rcvflg = MSG_WAITALL;
633 error = so_pru_soreceive(so, NULL, NULL, &sio,
634 NULL, &rcvflg);
635 } while (error == EWOULDBLOCK || error == EINTR ||
636 error == ERESTART);
637 if (error == 0 && sio.sb_cc != len) {
638 if (sio.sb_cc != 0)
639 log(LOG_INFO,
640 "short receive (%d/%d) from nfs server %s\n",
641 len - auio.uio_resid, len,
642 nmp->nm_mountp->mnt_stat.f_mntfromname);
643 error = EPIPE;
645 *mp = sio.sb_mb;
646 } else {
648 * Non-stream, so get the whole packet by not
649 * specifying MSG_WAITALL and by specifying a large
650 * length.
652 * We have no use for control msg., but must grab them
653 * and then throw them away so we know what is going
654 * on.
656 sbinit(&sio, 100000000);
657 do {
658 rcvflg = 0;
659 error = so_pru_soreceive(so, NULL, NULL, &sio,
660 &control, &rcvflg);
661 if (control)
662 m_freem(control);
663 if (error == EWOULDBLOCK && rep) {
664 if (rep->r_flags & R_SOFTTERM) {
665 m_freem(sio.sb_mb);
666 return (EINTR);
669 } while (error == EWOULDBLOCK ||
670 (error == 0 && sio.sb_mb == NULL && control));
671 if ((rcvflg & MSG_EOR) == 0)
672 kprintf("Egad!!\n");
673 if (error == 0 && sio.sb_mb == NULL)
674 error = EPIPE;
675 len = sio.sb_cc;
676 *mp = sio.sb_mb;
678 errout:
679 if (error && error != EINTR && error != ERESTART) {
680 m_freem(*mp);
681 *mp = NULL;
682 if (error != EPIPE) {
683 log(LOG_INFO,
684 "receive error %d from nfs server %s\n",
685 error,
686 nmp->nm_mountp->mnt_stat.f_mntfromname);
688 error = nfs_sndlock(nmp, rep);
689 if (!error) {
690 error = nfs_reconnect(nmp, rep);
691 if (!error)
692 goto tryagain;
693 else
694 nfs_sndunlock(nmp);
697 } else {
698 if ((so = nmp->nm_so) == NULL)
699 return (EACCES);
700 if (so->so_state & SS_ISCONNECTED)
701 getnam = NULL;
702 else
703 getnam = aname;
704 sbinit(&sio, 100000000);
705 do {
706 rcvflg = 0;
707 error = so_pru_soreceive(so, getnam, NULL, &sio,
708 NULL, &rcvflg);
709 if (error == EWOULDBLOCK && rep &&
710 (rep->r_flags & R_SOFTTERM)) {
711 m_freem(sio.sb_mb);
712 return (EINTR);
714 } while (error == EWOULDBLOCK);
715 len = sio.sb_cc;
716 *mp = sio.sb_mb;
718 if (error) {
719 m_freem(*mp);
720 *mp = NULL;
723 * Search for any mbufs that are not a multiple of 4 bytes long
724 * or with m_data not longword aligned.
725 * These could cause pointer alignment problems, so copy them to
726 * well aligned mbufs.
728 nfs_realign(mp, 5 * NFSX_UNSIGNED);
729 return (error);
733 * Implement receipt of reply on a socket.
735 * We must search through the list of received datagrams matching them
736 * with outstanding requests using the xid, until ours is found.
738 * If myrep is NULL we process packets on the socket until
739 * interrupted or until nm_reqrxq is non-empty.
741 /* ARGSUSED */
743 nfs_reply(struct nfsmount *nmp, struct nfsreq *myrep)
745 struct nfsreq *rep;
746 struct sockaddr *nam;
747 u_int32_t rxid;
748 u_int32_t *tl;
749 int error;
750 struct nfsm_info info;
753 * Loop around until we get our own reply
755 for (;;) {
757 * Lock against other receivers so that I don't get stuck in
758 * sbwait() after someone else has received my reply for me.
759 * Also necessary for connection based protocols to avoid
760 * race conditions during a reconnect.
762 * If nfs_rcvlock() returns EALREADY, that means that
763 * the reply has already been received by another
764 * process and we can return immediately. In this
765 * case, the lock is not taken to avoid races with
766 * other processes.
768 info.mrep = NULL;
770 error = nfs_rcvlock(nmp, myrep);
771 if (error == EALREADY)
772 return (0);
773 if (error)
774 return (error);
777 * If myrep is NULL we are the receiver helper thread.
778 * Stop waiting for incoming replies if there are
779 * messages sitting on reqrxq that we need to process,
780 * or if a shutdown request is pending.
782 if (myrep == NULL && (TAILQ_FIRST(&nmp->nm_reqrxq) ||
783 nmp->nm_rxstate > NFSSVC_PENDING)) {
784 nfs_rcvunlock(nmp);
785 return(EWOULDBLOCK);
789 * Get the next Rpc reply off the socket
791 * We cannot release the receive lock until we've
792 * filled in rep->r_mrep, otherwise a waiting
793 * thread may deadlock in soreceive with no incoming
794 * packets expected.
796 error = nfs_receive(nmp, myrep, &nam, &info.mrep);
797 if (error) {
799 * Ignore routing errors on connectionless protocols??
801 nfs_rcvunlock(nmp);
802 if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) {
803 if (nmp->nm_so == NULL)
804 return (error);
805 nmp->nm_so->so_error = 0;
806 continue;
808 return (error);
810 if (nam)
811 FREE(nam, M_SONAME);
814 * Get the xid and check that it is an rpc reply
816 info.md = info.mrep;
817 info.dpos = mtod(info.md, caddr_t);
818 NULLOUT(tl = nfsm_dissect(&info, 2*NFSX_UNSIGNED));
819 rxid = *tl++;
820 if (*tl != rpc_reply) {
821 nfsstats.rpcinvalid++;
822 m_freem(info.mrep);
823 info.mrep = NULL;
824 nfsmout:
825 nfs_rcvunlock(nmp);
826 continue;
830 * Loop through the request list to match up the reply
831 * Iff no match, just drop the datagram. On match, set
832 * r_mrep atomically to prevent the timer from messing
833 * around with the request after we have exited the critical
834 * section.
836 crit_enter();
837 TAILQ_FOREACH(rep, &nmp->nm_reqq, r_chain) {
838 if (rep->r_mrep == NULL && rxid == rep->r_xid)
839 break;
843 * Fill in the rest of the reply if we found a match.
845 if (rep) {
846 rep->r_md = info.md;
847 rep->r_dpos = info.dpos;
848 if (nfsrtton) {
849 struct rttl *rt;
851 rt = &nfsrtt.rttl[nfsrtt.pos];
852 rt->proc = rep->r_procnum;
853 rt->rto = 0;
854 rt->sent = 0;
855 rt->cwnd = nmp->nm_maxasync_scaled;
856 rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1];
857 rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1];
858 rt->fsid = nmp->nm_mountp->mnt_stat.f_fsid;
859 getmicrotime(&rt->tstamp);
860 if (rep->r_flags & R_TIMING)
861 rt->rtt = rep->r_rtt;
862 else
863 rt->rtt = 1000000;
864 nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ;
868 * New congestion control is based only on async
869 * requests.
871 if (nmp->nm_maxasync_scaled < NFS_MAXASYNC_SCALED)
872 ++nmp->nm_maxasync_scaled;
873 if (rep->r_flags & R_SENT) {
874 rep->r_flags &= ~R_SENT;
877 * Update rtt using a gain of 0.125 on the mean
878 * and a gain of 0.25 on the deviation.
880 * NOTE SRTT/SDRTT are only good if R_TIMING is set.
882 if (rep->r_flags & R_TIMING) {
884 * Since the timer resolution of
885 * NFS_HZ is so coarse, it can often
886 * result in r_rtt == 0. Since
887 * r_rtt == N means that the actual
888 * rtt is between N+dt and N+2-dt ticks,
889 * add 1.
891 int n;
892 int d;
894 #define NFSRSB NFS_RTT_SCALE_BITS
895 n = ((NFS_SRTT(rep) * 7) +
896 (rep->r_rtt << NFSRSB)) >> 3;
897 d = n - NFS_SRTT(rep);
898 NFS_SRTT(rep) = n;
901 * Don't let the jitter calculation decay
902 * too quickly, but we want a fast rampup.
904 if (d < 0)
905 d = -d;
906 d <<= NFSRSB;
907 if (d < NFS_SDRTT(rep))
908 n = ((NFS_SDRTT(rep) * 15) + d) >> 4;
909 else
910 n = ((NFS_SDRTT(rep) * 3) + d) >> 2;
911 NFS_SDRTT(rep) = n;
912 #undef NFSRSB
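/*
 * Worked example of the smoothing above: with NFS_SRTT(rep) == 640
 * (2.5 ticks scaled by 256) and a new sample of r_rtt == 4 ticks
 * (1024 scaled), n = (640 * 7 + 1024) >> 3 == 688, i.e. the mean
 * moves 1/8 of the way toward the sample. The deviation filter
 * ramps up at a gain of 1/4 but decays at only 1/16.
 */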
914 nmp->nm_timeouts = 0;
915 rep->r_mrep = info.mrep;
916 nfs_hardterm(rep, 0);
918 nfs_rcvunlock(nmp);
919 crit_exit();
922 * If not matched to a request, drop it.
923 * If it's mine, get out.
925 if (rep == NULL) {
926 nfsstats.rpcunexpected++;
927 m_freem(info.mrep);
928 info.mrep = NULL;
929 } else if (rep == myrep) {
930 if (rep->r_mrep == NULL)
931 panic("nfsreply nil");
932 return (0);
938 * Run the request state machine until the target state is reached
939 * or a fatal error occurs. The target state is not run. Specifying
940 * a target of NFSM_STATE_DONE runs the state machine until the rpc
941 * is complete.
943 * EINPROGRESS is returned for all states other than the DONE state,
944 * indicating that the rpc is still in progress.
947 nfs_request(struct nfsm_info *info, nfsm_state_t bstate, nfsm_state_t estate)
949 struct nfsreq *req;
951 while (info->state >= bstate && info->state < estate) {
952 switch(info->state) {
953 case NFSM_STATE_SETUP:
955 * Setup the nfsreq. Any error which occurs during
956 * this state is fatal.
958 info->error = nfs_request_setup(info);
959 if (info->error) {
960 info->state = NFSM_STATE_DONE;
961 return (info->error);
962 } else {
963 req = info->req;
964 req->r_mrp = &info->mrep;
965 req->r_mdp = &info->md;
966 req->r_dposp = &info->dpos;
967 info->state = NFSM_STATE_AUTH;
969 break;
970 case NFSM_STATE_AUTH:
972 * Authenticate the nfsreq. Any error which occurs
973 * during this state is fatal.
975 info->error = nfs_request_auth(info->req);
976 if (info->error) {
977 info->state = NFSM_STATE_DONE;
978 return (info->error);
979 } else {
980 info->state = NFSM_STATE_TRY;
982 break;
983 case NFSM_STATE_TRY:
985 * Transmit or retransmit attempt. An error in this
986 * state is ignored and we always move on to the
987 * next state.
989 * This can trivially race the receiver if the
990 * request is asynchronous. nfs_request_try()
991 * will thus set the state for us and we
992 * must also return immediately if we are
993 * running an async state machine, because
994 * info can become invalid due to races after
995 * try() returns.
997 if (info->req->r_flags & R_ASYNC) {
998 nfs_request_try(info->req);
999 if (estate == NFSM_STATE_WAITREPLY)
1000 return (EINPROGRESS);
1001 } else {
1002 nfs_request_try(info->req);
1003 info->state = NFSM_STATE_WAITREPLY;
1005 break;
1006 case NFSM_STATE_WAITREPLY:
1008 * Wait for a reply or timeout and move on to the
1009 * next state. The error returned by this state
1010 * is passed to the processing code in the next
1011 * state.
1013 info->error = nfs_request_waitreply(info->req);
1014 info->state = NFSM_STATE_PROCESSREPLY;
1015 break;
1016 case NFSM_STATE_PROCESSREPLY:
1018 * Process the reply or timeout. Errors which occur
1019 * in this state may cause the state machine to
1020 * go back to an earlier state, and are fatal
1021 * otherwise.
1023 info->error = nfs_request_processreply(info,
1024 info->error);
1025 switch(info->error) {
1026 case ENEEDAUTH:
1027 info->state = NFSM_STATE_AUTH;
1028 break;
1029 case EAGAIN:
1030 info->state = NFSM_STATE_TRY;
1031 break;
1032 default:
1034 * Operation complete, with or without an
1035 * error. We are done.
1037 info->req = NULL;
1038 info->state = NFSM_STATE_DONE;
1039 return (info->error);
1041 break;
1042 case NFSM_STATE_DONE:
1044 * Shouldn't be reached
1046 return (info->error);
1047 /* NOT REACHED */
1052 * If we are done return the error code (if any).
1053 * Otherwise return EINPROGRESS.
1055 if (info->state == NFSM_STATE_DONE)
1056 return (info->error);
1057 return (EINPROGRESS);
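/*
 * Hypothetical usage sketch (not from the original source): a
 * synchronous caller typically drives the machine from SETUP all the
 * way to DONE in a single call, while the async helper threads
 * re-enter nfs_request() with narrower (bstate, estate) windows:
 */
#if 0	/* illustrative only */
	struct nfsm_info info;
	int error;

	info.state = NFSM_STATE_SETUP;
	/* ... fill in info.mreq, info.vp, info.procnum, info.cred ... */
	error = nfs_request(&info, NFSM_STATE_SETUP, NFSM_STATE_DONE);
	/* a sync request with a DONE target returns the final error,
	 * never EINPROGRESS */
#endif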
1061 * nfs_request - goes something like this
1062 * - fill in request struct
1063 * - links it into list
1064 * - calls nfs_send() for first transmit
1065 * - calls nfs_receive() to get reply
1066 * - break down rpc header and return with nfs reply pointed to
1067 * by mrep or error
1068 * nb: always frees up mreq mbuf list
1070 static int
1071 nfs_request_setup(nfsm_info_t info)
1073 struct nfsreq *req;
1074 struct nfsmount *nmp;
1075 struct mbuf *m;
1076 int i;
1079 * Reject requests while attempting a forced unmount.
1081 if (info->vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) {
1082 m_freem(info->mreq);
1083 info->mreq = NULL;
1084 return (ESTALE);
1086 nmp = VFSTONFS(info->vp->v_mount);
1087 req = kmalloc(sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
1088 req->r_nmp = nmp;
1089 req->r_vp = info->vp;
1090 req->r_td = info->td;
1091 req->r_procnum = info->procnum;
1092 req->r_mreq = NULL;
1093 req->r_cred = info->cred;
1095 i = 0;
1096 m = info->mreq;
1097 while (m) {
1098 i += m->m_len;
1099 m = m->m_next;
1101 req->r_mrest = info->mreq;
1102 req->r_mrest_len = i;
1105 * The presence of a non-NULL r_info in req indicates
1106 * async completion via our helper threads. See the receiver
1107 * code.
1109 if (info->bio) {
1110 req->r_info = info;
1111 req->r_flags = R_ASYNC;
1112 } else {
1113 req->r_info = NULL;
1114 req->r_flags = 0;
1116 info->req = req;
1117 return(0);
1120 static int
1121 nfs_request_auth(struct nfsreq *rep)
1123 struct nfsmount *nmp = rep->r_nmp;
1124 struct mbuf *m;
1125 char nickv[RPCX_NICKVERF];
1126 int error = 0, auth_len, auth_type;
1127 int verf_len;
1128 u_int32_t xid;
1129 char *auth_str, *verf_str;
1130 struct ucred *cred;
1132 cred = rep->r_cred;
1133 rep->r_failed_auth = 0;
1136 * Get the RPC header with authorization.
1138 verf_str = auth_str = NULL;
1139 if (nmp->nm_flag & NFSMNT_KERB) {
1140 verf_str = nickv;
1141 verf_len = sizeof (nickv);
1142 auth_type = RPCAUTH_KERB4;
1143 bzero((caddr_t)rep->r_key, sizeof(rep->r_key));
1144 if (rep->r_failed_auth ||
1145 nfs_getnickauth(nmp, cred, &auth_str, &auth_len,
1146 verf_str, verf_len)) {
1147 error = nfs_getauth(nmp, rep, cred, &auth_str,
1148 &auth_len, verf_str, &verf_len, rep->r_key);
1149 if (error) {
1150 m_freem(rep->r_mrest);
1151 rep->r_mrest = NULL;
1152 kfree((caddr_t)rep, M_NFSREQ);
1153 return (error);
1156 } else {
1157 auth_type = RPCAUTH_UNIX;
1158 if (cred->cr_ngroups < 1)
1159 panic("nfsreq nogrps");
1160 auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
1161 nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
1162 5 * NFSX_UNSIGNED;
1164 m = nfsm_rpchead(cred, nmp->nm_flag, rep->r_procnum, auth_type,
1165 auth_len, auth_str, verf_len, verf_str,
1166 rep->r_mrest, rep->r_mrest_len, &rep->r_mheadend, &xid);
1167 rep->r_mrest = NULL;
1168 if (auth_str)
1169 kfree(auth_str, M_TEMP);
1172 * For stream protocols, insert a Sun RPC Record Mark.
1174 if (nmp->nm_sotype == SOCK_STREAM) {
1175 M_PREPEND(m, NFSX_UNSIGNED, MB_WAIT);
1176 if (m == NULL) {
1177 kfree(rep, M_NFSREQ);
1178 return (ENOBUFS);
1180 *mtod(m, u_int32_t *) = htonl(0x80000000 |
1181 (m->m_pkthdr.len - NFSX_UNSIGNED));
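/*
 * Example: a 300 byte RPC becomes 304 bytes after M_PREPEND, so the
 * mark written here is htonl(0x80000000 | 300), i.e. the 0x8000012c
 * record mark that nfs_receive() strips again on the way back in.
 */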
1183 rep->r_mreq = m;
1184 rep->r_xid = xid;
1185 return (0);
1188 static int
1189 nfs_request_try(struct nfsreq *rep)
1191 struct nfsmount *nmp = rep->r_nmp;
1192 struct mbuf *m2;
1193 int error;
1196 * Request is not on any queue, only the owner has access to it
1197 * so it should not be locked by anyone atm.
1199 * Interlock to prevent races. While locked the only remote
1200 * action possible is for r_mrep to be set (once we enqueue it).
1202 if (rep->r_flags == 0xdeadc0de) {
1203 print_backtrace();
1204 panic("flags nbad\n");
1206 KKASSERT((rep->r_flags & (R_LOCKED | R_ONREQQ)) == 0);
1207 if (nmp->nm_flag & NFSMNT_SOFT)
1208 rep->r_retry = nmp->nm_retry;
1209 else
1210 rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */
1211 rep->r_rtt = rep->r_rexmit = 0;
1212 if (proct[rep->r_procnum] > 0)
1213 rep->r_flags |= R_TIMING | R_LOCKED;
1214 else
1215 rep->r_flags |= R_LOCKED;
1216 rep->r_mrep = NULL;
1219 * Do the client side RPC.
1221 nfsstats.rpcrequests++;
1224 * Chain request into list of outstanding requests. Be sure
1225 * to put it LAST so timer finds oldest requests first. Note
1226 * that our control of R_LOCKED prevents the request from
1227 * getting ripped out from under us or transmitted by the
1228 * timer code.
1230 * For requests with info structures we must atomically set the
1231 * info's state because the structure could become invalid upon
1232 * return due to races (i.e., if async)
1234 crit_enter();
1235 mtx_link_init(&rep->r_link);
1236 TAILQ_INSERT_TAIL(&nmp->nm_reqq, rep, r_chain);
1237 rep->r_flags |= R_ONREQQ;
1238 ++nmp->nm_reqqlen;
1239 if (rep->r_flags & R_ASYNC)
1240 rep->r_info->state = NFSM_STATE_WAITREPLY;
1241 crit_exit();
1243 error = 0;
1246 * Send if we can. Congestion control is not handled here any more
1247 * because trying to defer the initial send based on the nfs_timer
1248 * requires having a very fast nfs_timer, which is silly.
1250 if (nmp->nm_so) {
1251 if (nmp->nm_soflags & PR_CONNREQUIRED)
1252 error = nfs_sndlock(nmp, rep);
1253 if (error == 0) {
1254 m2 = m_copym(rep->r_mreq, 0, M_COPYALL, MB_WAIT);
1255 error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
1256 if (nmp->nm_soflags & PR_CONNREQUIRED)
1257 nfs_sndunlock(nmp);
1258 rep->r_flags &= ~R_NEEDSXMIT;
1259 if ((rep->r_flags & R_SENT) == 0) {
1260 rep->r_flags |= R_SENT;
1262 } else {
1263 rep->r_flags |= R_NEEDSXMIT;
1265 } else {
1266 rep->r_flags |= R_NEEDSXMIT;
1267 rep->r_rtt = -1;
1269 if (error == EPIPE)
1270 error = 0;
1273 * Release the lock. The only remote action that may have occurred
1274 * would have been the setting of rep->r_mrep. If this occurred
1275 * and the request was async we have to move it to the reader
1276 * thread's queue for action.
1278 * For async requests also make sure the reader is woken up so
1279 * it gets on the socket to read responses.
1281 crit_enter();
1282 if (rep->r_flags & R_ASYNC) {
1283 if (rep->r_mrep)
1284 nfs_hardterm(rep, 1);
1285 rep->r_flags &= ~R_LOCKED;
1286 nfssvc_iod_reader_wakeup(nmp);
1287 } else {
1288 rep->r_flags &= ~R_LOCKED;
1290 if (rep->r_flags & R_WANTED) {
1291 rep->r_flags &= ~R_WANTED;
1292 wakeup(rep);
1294 crit_exit();
1295 return (error);
1299 * This code is only called for synchronous requests. Completed synchronous
1300 * requests are left on reqq and we remove them before moving on to the
1301 * processing state.
1303 static int
1304 nfs_request_waitreply(struct nfsreq *rep)
1306 struct nfsmount *nmp = rep->r_nmp;
1307 int error;
1309 KKASSERT((rep->r_flags & R_ASYNC) == 0);
1312 * Wait until the request is finished.
1314 error = nfs_reply(nmp, rep);
1317 * RPC done, unlink the request, but don't rip it out from under
1318 * the callout timer.
1320 * Once unlinked no other receiver or the timer will have
1321 * visibility, so we do not have to set R_LOCKED.
1323 crit_enter();
1324 while (rep->r_flags & R_LOCKED) {
1325 rep->r_flags |= R_WANTED;
1326 tsleep(rep, 0, "nfstrac", 0);
1328 KKASSERT(rep->r_flags & R_ONREQQ);
1329 TAILQ_REMOVE(&nmp->nm_reqq, rep, r_chain);
1330 rep->r_flags &= ~R_ONREQQ;
1331 --nmp->nm_reqqlen;
1332 crit_exit();
1335 * Decrement the outstanding request count.
1337 if (rep->r_flags & R_SENT) {
1338 rep->r_flags &= ~R_SENT;
1340 return (error);
1344 * Process reply with error returned from nfs_request_waitreply().
1346 * Returns EAGAIN if it wants us to loop up to nfs_request_try() again.
1347 * Returns ENEEDAUTH if it wants us to loop up to nfs_request_auth() again.
1349 static int
1350 nfs_request_processreply(nfsm_info_t info, int error)
1352 struct nfsreq *req = info->req;
1353 struct nfsmount *nmp = req->r_nmp;
1354 u_int32_t *tl;
1355 int verf_type;
1356 int i;
1359 * If there was a successful reply and a tprintf msg had previously
1360 * been printed, tprintf a response saying the server is alive again.
1362 if (error == 0 && (req->r_flags & R_TPRINTFMSG)) {
1363 nfs_msg(req->r_td, nmp->nm_mountp->mnt_stat.f_mntfromname,
1364 "is alive again");
1366 info->mrep = req->r_mrep;
1367 info->md = req->r_md;
1368 info->dpos = req->r_dpos;
1369 if (error) {
1370 m_freem(req->r_mreq);
1371 req->r_mreq = NULL;
1372 kfree(req, M_NFSREQ);
1373 info->req = NULL;
1374 return (error);
1378 * break down the rpc header and check if ok
1380 NULLOUT(tl = nfsm_dissect(info, 3 * NFSX_UNSIGNED));
1381 if (*tl++ == rpc_msgdenied) {
1382 if (*tl == rpc_mismatch) {
1383 error = EOPNOTSUPP;
1384 } else if ((nmp->nm_flag & NFSMNT_KERB) &&
1385 *tl++ == rpc_autherr) {
1386 if (req->r_failed_auth == 0) {
1387 req->r_failed_auth++;
1388 req->r_mheadend->m_next = NULL;
1389 m_freem(info->mrep);
1390 info->mrep = NULL;
1391 m_freem(req->r_mreq);
1392 return (ENEEDAUTH);
1393 } else {
1394 error = EAUTH;
1396 } else {
1397 error = EACCES;
1399 m_freem(info->mrep);
1400 info->mrep = NULL;
1401 m_freem(req->r_mreq);
1402 req->r_mreq = NULL;
1403 kfree(req, M_NFSREQ);
1404 info->req = NULL;
1405 return (error);
1409 * Grab any Kerberos verifier, otherwise just throw it away.
1411 verf_type = fxdr_unsigned(int, *tl++);
1412 i = fxdr_unsigned(int32_t, *tl);
1413 if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) {
1414 error = nfs_savenickauth(nmp, req->r_cred, i, req->r_key,
1415 &info->md, &info->dpos, info->mrep);
1416 if (error)
1417 goto nfsmout;
1418 } else if (i > 0) {
1419 ERROROUT(nfsm_adv(info, nfsm_rndup(i)));
1421 NULLOUT(tl = nfsm_dissect(info, NFSX_UNSIGNED));
1422 /* 0 == ok */
1423 if (*tl == 0) {
1424 NULLOUT(tl = nfsm_dissect(info, NFSX_UNSIGNED));
1425 if (*tl != 0) {
1426 error = fxdr_unsigned(int, *tl);
1429 * Does anyone even implement this? Just impose
1430 * a 1-second delay.
1432 if ((nmp->nm_flag & NFSMNT_NFSV3) &&
1433 error == NFSERR_TRYLATER) {
1434 m_freem(info->mrep);
1435 info->mrep = NULL;
1436 error = 0;
1438 tsleep((caddr_t)&lbolt, 0, "nqnfstry", 0);
1439 return (EAGAIN); /* goto tryagain */
1443 * If the File Handle was stale, invalidate the
1444 * lookup cache, just in case.
1446 * To avoid namecache<->vnode deadlocks we must
1447 * release the vnode lock if we hold it.
1449 if (error == ESTALE) {
1450 struct vnode *vp = req->r_vp;
1451 int ltype;
1453 ltype = lockstatus(&vp->v_lock, curthread);
1454 if (ltype == LK_EXCLUSIVE || ltype == LK_SHARED)
1455 lockmgr(&vp->v_lock, LK_RELEASE);
1456 cache_inval_vp(vp, CINV_CHILDREN);
1457 if (ltype == LK_EXCLUSIVE || ltype == LK_SHARED)
1458 lockmgr(&vp->v_lock, ltype);
1460 if (nmp->nm_flag & NFSMNT_NFSV3) {
1461 KKASSERT(*req->r_mrp == info->mrep);
1462 KKASSERT(*req->r_mdp == info->md);
1463 KKASSERT(*req->r_dposp == info->dpos);
1464 error |= NFSERR_RETERR;
1465 } else {
1466 m_freem(info->mrep);
1467 info->mrep = NULL;
1469 m_freem(req->r_mreq);
1470 req->r_mreq = NULL;
1471 kfree(req, M_NFSREQ);
1472 info->req = NULL;
1473 return (error);
1476 KKASSERT(*req->r_mrp == info->mrep);
1477 KKASSERT(*req->r_mdp == info->md);
1478 KKASSERT(*req->r_dposp == info->dpos);
1479 m_freem(req->r_mreq);
1480 req->r_mreq = NULL;
1481 FREE(req, M_NFSREQ);
1482 return (0);
1484 m_freem(info->mrep);
1485 info->mrep = NULL;
1486 error = EPROTONOSUPPORT;
1487 nfsmout:
1488 m_freem(req->r_mreq);
1489 req->r_mreq = NULL;
1490 kfree(req, M_NFSREQ);
1491 info->req = NULL;
1492 return (error);
1495 #ifndef NFS_NOSERVER
1497 * Generate the rpc reply header
1498 * siz arg. is used to decide if adding a cluster is worthwhile
1501 nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp,
1502 int err, struct mbuf **mrq, struct mbuf **mbp, caddr_t *bposp)
1504 u_int32_t *tl;
1505 struct nfsm_info info;
1507 siz += RPC_REPLYSIZ;
1508 info.mb = m_getl(max_hdr + siz, MB_WAIT, MT_DATA, M_PKTHDR, NULL);
1509 info.mreq = info.mb;
1510 info.mreq->m_pkthdr.len = 0;
1512 * If this is not a cluster, try and leave leading space
1513 * for the lower level headers.
1515 if ((max_hdr + siz) < MINCLSIZE)
1516 info.mreq->m_data += max_hdr;
1517 tl = mtod(info.mreq, u_int32_t *);
1518 info.mreq->m_len = 6 * NFSX_UNSIGNED;
1519 info.bpos = ((caddr_t)tl) + info.mreq->m_len;
1520 *tl++ = txdr_unsigned(nd->nd_retxid);
1521 *tl++ = rpc_reply;
1522 if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
1523 *tl++ = rpc_msgdenied;
1524 if (err & NFSERR_AUTHERR) {
1525 *tl++ = rpc_autherr;
1526 *tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
1527 info.mreq->m_len -= NFSX_UNSIGNED;
1528 info.bpos -= NFSX_UNSIGNED;
1529 } else {
1530 *tl++ = rpc_mismatch;
1531 *tl++ = txdr_unsigned(RPC_VER2);
1532 *tl = txdr_unsigned(RPC_VER2);
1534 } else {
1535 *tl++ = rpc_msgaccepted;
1538 * For Kerberos authentication, we must send the nickname
1539 * verifier back, otherwise just RPCAUTH_NULL.
1541 if (nd->nd_flag & ND_KERBFULL) {
1542 struct nfsuid *nuidp;
1543 struct timeval ktvin, ktvout;
1545 for (nuidp = NUIDHASH(slp, nd->nd_cr.cr_uid)->lh_first;
1546 nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
1547 if (nuidp->nu_cr.cr_uid == nd->nd_cr.cr_uid &&
1548 (!nd->nd_nam2 || netaddr_match(NU_NETFAM(nuidp),
1549 &nuidp->nu_haddr, nd->nd_nam2)))
1550 break;
1552 if (nuidp) {
1553 ktvin.tv_sec =
1554 txdr_unsigned(nuidp->nu_timestamp.tv_sec - 1);
1555 ktvin.tv_usec =
1556 txdr_unsigned(nuidp->nu_timestamp.tv_usec);
1559 * Encrypt the timestamp in ecb mode using the
1560 * session key.
1562 #ifdef NFSKERB
1564 #endif
1566 *tl++ = rpc_auth_kerb;
1567 *tl++ = txdr_unsigned(3 * NFSX_UNSIGNED);
1568 *tl = ktvout.tv_sec;
1569 tl = nfsm_build(&info, 3 * NFSX_UNSIGNED);
1570 *tl++ = ktvout.tv_usec;
1571 *tl++ = txdr_unsigned(nuidp->nu_cr.cr_uid);
1572 } else {
1573 *tl++ = 0;
1574 *tl++ = 0;
1576 } else {
1577 *tl++ = 0;
1578 *tl++ = 0;
1580 switch (err) {
1581 case EPROGUNAVAIL:
1582 *tl = txdr_unsigned(RPC_PROGUNAVAIL);
1583 break;
1584 case EPROGMISMATCH:
1585 *tl = txdr_unsigned(RPC_PROGMISMATCH);
1586 tl = nfsm_build(&info, 2 * NFSX_UNSIGNED);
1587 *tl++ = txdr_unsigned(2);
1588 *tl = txdr_unsigned(3);
1589 break;
1590 case EPROCUNAVAIL:
1591 *tl = txdr_unsigned(RPC_PROCUNAVAIL);
1592 break;
1593 case EBADRPC:
1594 *tl = txdr_unsigned(RPC_GARBAGE);
1595 break;
1596 default:
1597 *tl = 0;
1598 if (err != NFSERR_RETVOID) {
1599 tl = nfsm_build(&info, NFSX_UNSIGNED);
1600 if (err)
1601 *tl = txdr_unsigned(nfsrv_errmap(nd, err));
1602 else
1603 *tl = 0;
1605 break;
1609 if (mrq != NULL)
1610 *mrq = info.mreq;
1611 *mbp = info.mb;
1612 *bposp = info.bpos;
1613 if (err != 0 && err != NFSERR_RETVOID)
1614 nfsstats.srvrpc_errs++;
1615 return (0);
1619 #endif /* NFS_NOSERVER */
1622 * Nfs timer routine.
1624 * Scan the nfsreq list and retransmit any requests that have timed out.
1625 * To avoid retransmission attempts on STREAM sockets (in the future) make
1626 * sure to set the r_retry field to 0 (implies nm_retry == 0).
1628 * Requests with attached responses, terminated requests, and
1629 * locked requests are ignored. Locked requests will be picked up
1630 * in a later timer call.
1632 void
1633 nfs_timer(void *arg /* never used */)
1635 struct nfsmount *nmp;
1636 struct nfsreq *req;
1637 #ifndef NFS_NOSERVER
1638 struct nfssvc_sock *slp;
1639 u_quad_t cur_usec;
1640 #endif /* NFS_NOSERVER */
1642 crit_enter();
1643 TAILQ_FOREACH(nmp, &nfs_mountq, nm_entry) {
1644 TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) {
1645 KKASSERT(nmp == req->r_nmp);
1646 if (req->r_mrep)
1647 continue;
1648 if (req->r_flags & (R_SOFTTERM | R_LOCKED))
1649 continue;
1650 req->r_flags |= R_LOCKED;
1651 if (nfs_sigintr(nmp, req, req->r_td)) {
1652 nfs_softterm(req, 1);
1653 } else {
1654 nfs_timer_req(req);
1656 req->r_flags &= ~R_LOCKED;
1657 if (req->r_flags & R_WANTED) {
1658 req->r_flags &= ~R_WANTED;
1659 wakeup(req);
1663 #ifndef NFS_NOSERVER
1666 * Scan the write gathering queues for writes that need to be
1667 * completed now.
1669 cur_usec = nfs_curusec();
1670 TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) {
1671 if (slp->ns_tq.lh_first && slp->ns_tq.lh_first->nd_time <= cur_usec)
1672 nfsrv_wakenfsd(slp, 1);
1674 #endif /* NFS_NOSERVER */
1675 crit_exit();
1676 callout_reset(&nfs_timer_handle, nfs_ticks, nfs_timer, NULL);
1679 static
1680 void
1681 nfs_timer_req(struct nfsreq *req)
1683 struct thread *td = &thread0; /* XXX for creds, will break if sleep */
1684 struct nfsmount *nmp = req->r_nmp;
1685 struct mbuf *m;
1686 struct socket *so;
1687 int timeo;
1688 int error;
1691 * rtt ticks and timeout calculation. Return if the timeout
1692 * has not been reached yet, unless the packet is flagged
1693 * for an immediate send.
1695 * The mean rtt doesn't help when we get random I/Os, we have
1696 * to multiply by fairly large numbers.
1698 if (req->r_rtt >= 0) {
1699 req->r_rtt++;
1700 if (nmp->nm_flag & NFSMNT_DUMBTIMR) {
1701 timeo = nmp->nm_timeo << NFS_RTT_SCALE_BITS;
1702 } else if (req->r_flags & R_TIMING) {
1703 timeo = NFS_SRTT(req) + NFS_SDRTT(req);
1704 } else {
1705 timeo = nmp->nm_timeo << NFS_RTT_SCALE_BITS;
1707 /* timeo is still scaled by SCALE_BITS */
1709 #define NFSFS (NFS_RTT_SCALE * NFS_HZ)
1710 if (req->r_flags & R_TIMING) {
1711 static long last_time;
1712 if (nfs_showrtt && last_time != time_second) {
1713 kprintf("rpccmd %d NFS SRTT %d SDRTT %d "
1714 "timeo %d.%03d\n",
1715 proct[req->r_procnum],
1716 NFS_SRTT(req), NFS_SDRTT(req),
1717 timeo / NFSFS,
1718 timeo % NFSFS * 1000 / NFSFS);
1719 last_time = time_second;
1722 #undef NFSFS
1725 * deal with nfs_timer jitter.
1727 timeo = (timeo >> NFS_RTT_SCALE_BITS) + 1;
1728 if (timeo < 2)
1729 timeo = 2;
1731 if (nmp->nm_timeouts > 0)
1732 timeo *= nfs_backoff[nmp->nm_timeouts - 1];
1733 if (timeo > NFS_MAXTIMEO)
1734 timeo = NFS_MAXTIMEO;
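/*
 * Worked example: with a descaled timeout of 2 ticks and three prior
 * timeouts recorded in nm_timeouts, the effective timeout becomes
 * 2 * nfs_backoff[2] == 10 ticks, subject to the NFS_MAXTIMEO cap.
 */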
1735 if (req->r_rtt <= timeo) {
1736 if ((req->r_flags & R_NEEDSXMIT) == 0)
1737 return;
1738 } else if (nmp->nm_timeouts < 8) {
1739 nmp->nm_timeouts++;
1744 * Check for server not responding
1746 if ((req->r_flags & R_TPRINTFMSG) == 0 &&
1747 req->r_rexmit > nmp->nm_deadthresh) {
1748 nfs_msg(req->r_td, nmp->nm_mountp->mnt_stat.f_mntfromname,
1749 "not responding");
1750 req->r_flags |= R_TPRINTFMSG;
1752 if (req->r_rexmit >= req->r_retry) { /* too many */
1753 nfsstats.rpctimeouts++;
1754 nfs_softterm(req, 1);
1755 return;
1759 * Generally disable retransmission on reliable sockets,
1760 * unless the request is flagged for immediate send.
1762 if (nmp->nm_sotype != SOCK_DGRAM) {
1763 if (++req->r_rexmit > NFS_MAXREXMIT)
1764 req->r_rexmit = NFS_MAXREXMIT;
1765 if ((req->r_flags & R_NEEDSXMIT) == 0)
1766 return;
1770 * Stop here if we do not have a socket!
1772 if ((so = nmp->nm_so) == NULL)
1773 return;
1776 * If there is enough space and the window allows.. resend it.
1778 * Set r_rtt to -1 in case we fail to send it now.
1780 req->r_rtt = -1;
1781 if (ssb_space(&so->so_snd) >= req->r_mreq->m_pkthdr.len &&
1782 (req->r_flags & (R_SENT | R_NEEDSXMIT)) &&
1783 (m = m_copym(req->r_mreq, 0, M_COPYALL, MB_DONTWAIT))){
1784 req->r_flags &= ~R_NEEDSXMIT;
1785 if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
1786 error = so_pru_send(so, 0, m, NULL, NULL, td);
1787 else
1788 error = so_pru_send(so, 0, m, nmp->nm_nam,
1789 NULL, td);
1790 if (error) {
1791 if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
1792 so->so_error = 0;
1793 req->r_flags |= R_NEEDSXMIT;
1794 } else if (req->r_mrep == NULL) {
1796 * Iff first send, start timing
1797 * else turn timing off, backoff timer
1798 * and divide congestion window by 2.
1800 * It is possible for the so_pru_send() to
1801 * block and for us to race a reply so we
1802 * only do this if the reply field has not
1803 * been filled in. R_LOCKED will prevent
1804 * the request from being ripped out from under
1805 * us entirely.
1807 if (req->r_flags & R_SENT) {
1808 if (nfs_showrexmit)
1809 kprintf("X");
1810 req->r_flags &= ~R_TIMING;
1811 if (++req->r_rexmit > NFS_MAXREXMIT)
1812 req->r_rexmit = NFS_MAXREXMIT;
1813 nmp->nm_maxasync_scaled >>= 1;
1814 if (nmp->nm_maxasync_scaled < NFS_MINASYNC_SCALED)
1815 nmp->nm_maxasync_scaled = NFS_MINASYNC_SCALED;
1816 nfsstats.rpcretries++;
1817 } else {
1818 req->r_flags |= R_SENT;
1820 req->r_rtt = 0;
1826 * Mark all of an nfs mount's outstanding requests with R_SOFTTERM and
1827 * wait for all requests to complete. This is used by forced unmounts
1828 * to terminate any outstanding RPCs.
1830 * Locked requests cannot be canceled but will be marked for
1831 * soft-termination.
1834 nfs_nmcancelreqs(struct nfsmount *nmp)
1836 struct nfsreq *req;
1837 int i;
1839 crit_enter();
1840 TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) {
1841 if (req->r_mrep != NULL || (req->r_flags & R_SOFTTERM))
1842 continue;
1843 nfs_softterm(req, 0);
1845 /* XXX the other two queues as well */
1846 crit_exit();
1848 for (i = 0; i < 30; i++) {
1849 crit_enter();
1850 TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) {
1851 if (nmp == req->r_nmp)
1852 break;
1854 crit_exit();
1855 if (req == NULL)
1856 return (0);
1857 tsleep(&lbolt, 0, "nfscancel", 0);
1859 return (EBUSY);
1863 * Soft-terminate a request, effectively marking it as failed.
1865 * Must be called from within a critical section.
1867 static void
1868 nfs_softterm(struct nfsreq *rep, int islocked)
1870 rep->r_flags |= R_SOFTTERM;
1871 nfs_hardterm(rep, islocked);
1875 * Hard-terminate a request, typically after getting a response.
1877 * The state machine can still decide to re-issue it later if necessary.
1879 * Must be called from within a critical section.
1881 static void
1882 nfs_hardterm(struct nfsreq *rep, int islocked)
1884 struct nfsmount *nmp = rep->r_nmp;
1887 * The nm_send count is decremented now to avoid deadlocks
1888 * when the process in soreceive() hasn't yet managed to send
1889 * its own request.
1891 if (rep->r_flags & R_SENT) {
1892 rep->r_flags &= ~R_SENT;
1896 * If we locked the request or nobody else has locked the request,
1897 * and the request is async, we can move it to the reader thread's
1898 * queue now and fix up the state.
1900 * If we locked the request or nobody else has locked the request,
1901 * we can wake up anyone blocked waiting for a response on the
1902 * request.
1904 if (islocked || (rep->r_flags & R_LOCKED) == 0) {
1905 if ((rep->r_flags & (R_ONREQQ | R_ASYNC)) ==
1906 (R_ONREQQ | R_ASYNC)) {
1907 rep->r_flags &= ~R_ONREQQ;
1908 TAILQ_REMOVE(&nmp->nm_reqq, rep, r_chain);
1909 --nmp->nm_reqqlen;
1910 TAILQ_INSERT_TAIL(&nmp->nm_reqrxq, rep, r_chain);
1911 KKASSERT(rep->r_info->state == NFSM_STATE_TRY ||
1912 rep->r_info->state == NFSM_STATE_WAITREPLY);
1913 rep->r_info->state = NFSM_STATE_PROCESSREPLY;
1914 nfssvc_iod_reader_wakeup(nmp);
1916 mtx_abort_ex_link(&nmp->nm_rxlock, &rep->r_link);
1921 * Test for a termination condition pending on the process.
1922 * This is used for NFSMNT_INT mounts.
1925 nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct thread *td)
1927 sigset_t tmpset;
1928 struct proc *p;
1929 struct lwp *lp;
1931 if (rep && (rep->r_flags & R_SOFTTERM))
1932 return (EINTR);
1933 /* Terminate all requests while attempting a forced unmount. */
1934 if (nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF)
1935 return (EINTR);
1936 if (!(nmp->nm_flag & NFSMNT_INT))
1937 return (0);
1938 /* td might be NULL YYY */
1939 if (td == NULL || (p = td->td_proc) == NULL)
1940 return (0);
1942 lp = td->td_lwp;
1943 tmpset = lwp_sigpend(lp);
1944 SIGSETNAND(tmpset, lp->lwp_sigmask);
1945 SIGSETNAND(tmpset, p->p_sigignore);
1946 if (SIGNOTEMPTY(tmpset) && NFSINT_SIGMASK(tmpset))
1947 return (EINTR);
1949 return (0);
1953 * Lock a socket against others.
1954 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
1955 * and also to avoid race conditions between the processes with nfs requests
1956 * in progress when a reconnect is necessary.
1959 nfs_sndlock(struct nfsmount *nmp, struct nfsreq *rep)
1961 mtx_t mtx = &nmp->nm_txlock;
1962 struct thread *td;
1963 int slptimeo;
1964 int slpflag;
1965 int error;
1967 slpflag = 0;
1968 slptimeo = 0;
1969 td = rep ? rep->r_td : NULL;
1970 if (nmp->nm_flag & NFSMNT_INT)
1971 slpflag = PCATCH;
1973 while ((error = mtx_lock_ex_try(mtx)) != 0) {
1974 if (nfs_sigintr(nmp, rep, td)) {
1975 error = EINTR;
1976 break;
1978 error = mtx_lock_ex(mtx, "nfsndlck", slpflag, slptimeo);
1979 if (error == 0)
1980 break;
1981 if (slpflag == PCATCH) {
1982 slpflag = 0;
1983 slptimeo = 2 * hz;
1986 /* Always fail if our request has been cancelled. */
1987 if (rep && (rep->r_flags & R_SOFTTERM)) {
1988 if (error == 0)
1989 mtx_unlock(mtx);
1990 error = EINTR;
1992 return (error);
1996 * Unlock the stream socket for others.
1998 void
1999 nfs_sndunlock(struct nfsmount *nmp)
2001 mtx_unlock(&nmp->nm_txlock);
2005 * Lock the receiver side of the socket.
2007 * rep may be NULL.
2009 static int
2010 nfs_rcvlock(struct nfsmount *nmp, struct nfsreq *rep)
2012 mtx_t mtx = &nmp->nm_rxlock;
2013 int slpflag;
2014 int slptimeo;
2015 int error;
2018 * Unconditionally check for completion in case another nfsiod
2019 * got the packet while the caller was blocked, before the caller
2020 * called us. Packet reception is handled by mainline code which
2021 * is protected by the BGL at the moment.
2023 * We do not strictly need the second check just before the
2024 * tsleep(), but it's good defensive programming.
2026 if (rep && rep->r_mrep != NULL)
2027 return (EALREADY);
2029 if (nmp->nm_flag & NFSMNT_INT)
2030 slpflag = PCATCH;
2031 else
2032 slpflag = 0;
2033 slptimeo = 0;
2035 while ((error = mtx_lock_ex_try(mtx)) != 0) {
2036 if (nfs_sigintr(nmp, rep, (rep ? rep->r_td : NULL))) {
2037 error = EINTR;
2038 break;
2040 if (rep && rep->r_mrep != NULL) {
2041 error = EALREADY;
2042 break;
2046 * NOTE: can return ENOLCK, but in that case rep->r_mrep
2047 * will already be set.
2049 if (rep) {
2050 error = mtx_lock_ex_link(mtx, &rep->r_link,
2051 "nfsrcvlk",
2052 slpflag, slptimeo);
2053 } else {
2054 error = mtx_lock_ex(mtx, "nfsrcvlk", slpflag, slptimeo);
2056 if (error == 0)
2057 break;
2060 * If our reply was received while we were sleeping,
2061 * then just return without taking the lock to avoid a
2062 * situation where a single iod could 'capture' the
2063 * receive lock.
2065 if (rep && rep->r_mrep != NULL) {
2066 error = EALREADY;
2067 break;
2069 if (slpflag == PCATCH) {
2070 slpflag = 0;
2071 slptimeo = 2 * hz;
2074 if (error == 0) {
2075 if (rep && rep->r_mrep != NULL) {
2076 error = EALREADY;
2077 mtx_unlock(mtx);
2080 return (error);
2084 * Unlock the stream socket for others.
2086 static void
2087 nfs_rcvunlock(struct nfsmount *nmp)
2089 mtx_unlock(&nmp->nm_rxlock);
2093 * nfs_realign:
2095 * Check for badly aligned mbuf data and realign by copying the unaligned
2096 * portion of the data into a new mbuf chain and freeing the portions
2097 * of the old chain that were replaced.
2099 * We cannot simply realign the data within the existing mbuf chain
2100 * because the underlying buffers may contain other rpc commands and
2101 * we cannot afford to overwrite them.
2103 * We would prefer to avoid this situation entirely. The situation does
2104 * not occur with NFS/UDP and is supposed to only occasionally occur
2105 * with TCP. Use vfs.nfs.realign_count and realign_test to check this.
2107 static void
2108 nfs_realign(struct mbuf **pm, int hsiz)
2110 struct mbuf *m;
2111 struct mbuf *n = NULL;
2112 int off = 0;
2114 ++nfs_realign_test;
2116 while ((m = *pm) != NULL) {
2117 if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
2118 n = m_getl(m->m_len, MB_WAIT, MT_DATA, 0, NULL);
2119 n->m_len = 0;
2120 break;
2122 pm = &m->m_next;
2126 * If n is non-NULL, loop on m copying data, then replace the
2127 * portion of the chain that had to be realigned.
2129 if (n != NULL) {
2130 ++nfs_realign_count;
2131 while (m) {
2132 m_copyback(n, off, m->m_len, mtod(m, caddr_t));
2133 off += m->m_len;
2134 m = m->m_next;
2136 m_freem(*pm);
2137 *pm = n;
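/*
 * Example of when the copy above triggers: TCP can hand us an mbuf
 * whose m_data starts at an odd address or whose m_len is, say, 3
 * bytes. Dissecting a u_int32_t XDR field across such a boundary
 * would fault on strict-alignment machines, hence the realign.
 */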
2141 #ifndef NFS_NOSERVER
2144 * Parse an RPC request
2145 * - verify it
2146 * - fill in the cred struct.
2149 nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
2151 int len, i;
2152 u_int32_t *tl;
2153 struct uio uio;
2154 struct iovec iov;
2155 caddr_t cp;
2156 u_int32_t nfsvers, auth_type;
2157 uid_t nickuid;
2158 int error = 0, ticklen;
2159 struct nfsuid *nuidp;
2160 struct timeval tvin, tvout;
2161 struct nfsm_info info;
2162 #if 0 /* until encrypted keys are implemented */
2163 NFSKERBKEYSCHED_T keys; /* stores key schedule */
2164 #endif
2166 info.mrep = nd->nd_mrep;
2167 info.md = nd->nd_md;
2168 info.dpos = nd->nd_dpos;
2170 if (has_header) {
2171 NULLOUT(tl = nfsm_dissect(&info, 10 * NFSX_UNSIGNED));
2172 nd->nd_retxid = fxdr_unsigned(u_int32_t, *tl++);
2173 if (*tl++ != rpc_call) {
2174 m_freem(info.mrep);
2175 return (EBADRPC);
2177 } else {
2178 NULLOUT(tl = nfsm_dissect(&info, 8 * NFSX_UNSIGNED));
2180 nd->nd_repstat = 0;
2181 nd->nd_flag = 0;
2182 if (*tl++ != rpc_vers) {
2183 nd->nd_repstat = ERPCMISMATCH;
2184 nd->nd_procnum = NFSPROC_NOOP;
2185 return (0);
2187 if (*tl != nfs_prog) {
2188 nd->nd_repstat = EPROGUNAVAIL;
2189 nd->nd_procnum = NFSPROC_NOOP;
2190 return (0);
2192 tl++;
2193 nfsvers = fxdr_unsigned(u_int32_t, *tl++);
2194 if (nfsvers < NFS_VER2 || nfsvers > NFS_VER3) {
2195 nd->nd_repstat = EPROGMISMATCH;
2196 nd->nd_procnum = NFSPROC_NOOP;
2197 return (0);
2199 if (nfsvers == NFS_VER3)
2200 nd->nd_flag = ND_NFSV3;
2201 nd->nd_procnum = fxdr_unsigned(u_int32_t, *tl++);
2202 if (nd->nd_procnum == NFSPROC_NULL)
2203 return (0);
2204 if (nd->nd_procnum >= NFS_NPROCS ||
2205 (nd->nd_procnum >= NQNFSPROC_GETLEASE) ||
2206 (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
2207 nd->nd_repstat = EPROCUNAVAIL;
2208 nd->nd_procnum = NFSPROC_NOOP;
2209 return (0);
2211 if ((nd->nd_flag & ND_NFSV3) == 0)
2212 nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
2213 auth_type = *tl++;
2214 len = fxdr_unsigned(int, *tl++);
2215 if (len < 0 || len > RPCAUTH_MAXSIZ) {
2216 m_freem(info.mrep);
2217 return (EBADRPC);
2220 nd->nd_flag &= ~ND_KERBAUTH;
2222 * Handle auth_unix or auth_kerb.
2224 if (auth_type == rpc_auth_unix) {
2225 len = fxdr_unsigned(int, *++tl);
2226 if (len < 0 || len > NFS_MAXNAMLEN) {
2227 m_freem(info.mrep);
2228 return (EBADRPC);
2230 ERROROUT(nfsm_adv(&info, nfsm_rndup(len)));
2231 NULLOUT(tl = nfsm_dissect(&info, 3 * NFSX_UNSIGNED));
2232 bzero((caddr_t)&nd->nd_cr, sizeof (struct ucred));
2233 nd->nd_cr.cr_ref = 1;
2234 nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
2235 nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++);
2236 len = fxdr_unsigned(int, *tl);
2237 if (len < 0 || len > RPCAUTH_UNIXGIDS) {
2238 m_freem(info.mrep);
2239 return (EBADRPC);
2241 NULLOUT(tl = nfsm_dissect(&info, (len + 2) * NFSX_UNSIGNED));
2242 for (i = 1; i <= len; i++)
2243 if (i < NGROUPS)
2244 nd->nd_cr.cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
2245 else
2246 tl++;
2247 nd->nd_cr.cr_ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
2248 if (nd->nd_cr.cr_ngroups > 1)
2249 nfsrvw_sort(nd->nd_cr.cr_groups, nd->nd_cr.cr_ngroups);
2250 len = fxdr_unsigned(int, *++tl);
2251 if (len < 0 || len > RPCAUTH_MAXSIZ) {
2252 m_freem(info.mrep);
2253 return (EBADRPC);
2255 if (len > 0) {
2256 ERROROUT(nfsm_adv(&info, nfsm_rndup(len)));
2258 } else if (auth_type == rpc_auth_kerb) {
2259 switch (fxdr_unsigned(int, *tl++)) {
2260 case RPCAKN_FULLNAME:
2261 ticklen = fxdr_unsigned(int, *tl);
2262 *((u_int32_t *)nfsd->nfsd_authstr) = *tl;
2263 uio.uio_resid = nfsm_rndup(ticklen) + NFSX_UNSIGNED;
2264 nfsd->nfsd_authlen = uio.uio_resid + NFSX_UNSIGNED;
2265 if (uio.uio_resid > (len - 2 * NFSX_UNSIGNED)) {
2266 m_freem(info.mrep);
2267 return (EBADRPC);
2269 uio.uio_offset = 0;
2270 uio.uio_iov = &iov;
2271 uio.uio_iovcnt = 1;
2272 uio.uio_segflg = UIO_SYSSPACE;
2273 iov.iov_base = (caddr_t)&nfsd->nfsd_authstr[4];
2274 iov.iov_len = RPCAUTH_MAXSIZ - 4;
2275 ERROROUT(nfsm_mtouio(&info, &uio, uio.uio_resid));
2276 NULLOUT(tl = nfsm_dissect(&info, 2 * NFSX_UNSIGNED));
2277 if (*tl++ != rpc_auth_kerb ||
2278 fxdr_unsigned(int, *tl) != 4 * NFSX_UNSIGNED) {
2279 kprintf("Bad kerb verifier\n");
2280 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2281 nd->nd_procnum = NFSPROC_NOOP;
2282 return (0);
2284 NULLOUT(cp = nfsm_dissect(&info, 4 * NFSX_UNSIGNED));
2285 tl = (u_int32_t *)cp;
2286 if (fxdr_unsigned(int, *tl) != RPCAKN_FULLNAME) {
2287 kprintf("Not fullname kerb verifier\n");
2288 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2289 nd->nd_procnum = NFSPROC_NOOP;
2290 return (0);
2292 cp += NFSX_UNSIGNED;
2293 bcopy(cp, nfsd->nfsd_verfstr, 3 * NFSX_UNSIGNED);
2294 nfsd->nfsd_verflen = 3 * NFSX_UNSIGNED;
2295 nd->nd_flag |= ND_KERBFULL;
2296 nfsd->nfsd_flag |= NFSD_NEEDAUTH;
2297 break;
2298 case RPCAKN_NICKNAME:
2299 if (len != 2 * NFSX_UNSIGNED) {
2300 kprintf("Kerb nickname short\n");
2301 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADCRED);
2302 nd->nd_procnum = NFSPROC_NOOP;
2303 return (0);
2305 nickuid = fxdr_unsigned(uid_t, *tl);
2306 NULLOUT(tl = nfsm_dissect(&info, 2 * NFSX_UNSIGNED));
2307 if (*tl++ != rpc_auth_kerb ||
2308 fxdr_unsigned(int, *tl) != 3 * NFSX_UNSIGNED) {
2309 kprintf("Kerb nick verifier bad\n");
2310 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
2311 nd->nd_procnum = NFSPROC_NOOP;
2312 return (0);
2314 NULLOUT(tl = nfsm_dissect(&info, 3 * NFSX_UNSIGNED));
2315 tvin.tv_sec = *tl++;
2316 tvin.tv_usec = *tl;
2318 for (nuidp = NUIDHASH(nfsd->nfsd_slp,nickuid)->lh_first;
2319 nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
2320 if (nuidp->nu_cr.cr_uid == nickuid &&
2321 (!nd->nd_nam2 ||
2322 netaddr_match(NU_NETFAM(nuidp),
2323 &nuidp->nu_haddr, nd->nd_nam2)))
2324 break;
2326 if (!nuidp) {
2327 nd->nd_repstat =
2328 (NFSERR_AUTHERR|AUTH_REJECTCRED);
2329 nd->nd_procnum = NFSPROC_NOOP;
2330 return (0);
2334 * Now, decrypt the timestamp using the session key
2335 * and validate it.
2337 #ifdef NFSKERB
2339 #endif
2341 tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec);
2342 tvout.tv_usec = fxdr_unsigned(long, tvout.tv_usec);
2343 if (nuidp->nu_expire < time_second ||
2344 nuidp->nu_timestamp.tv_sec > tvout.tv_sec ||
2345 (nuidp->nu_timestamp.tv_sec == tvout.tv_sec &&
2346 nuidp->nu_timestamp.tv_usec > tvout.tv_usec)) {
2347 nuidp->nu_expire = 0;
2348 nd->nd_repstat =
2349 (NFSERR_AUTHERR|AUTH_REJECTVERF);
2350 nd->nd_procnum = NFSPROC_NOOP;
2351 return (0);
2353 nfsrv_setcred(&nuidp->nu_cr, &nd->nd_cr);
2354 nd->nd_flag |= ND_KERBNICK;
2356 } else {
2357 nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
2358 nd->nd_procnum = NFSPROC_NOOP;
2359 return (0);
2362 nd->nd_md = info.md;
2363 nd->nd_dpos = info.dpos;
2364 return (0);
2365 nfsmout:
2366 return (error);
2369 #endif
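
For orientation, the words dissected at the top of nfs_getreq() follow the standard ONC RPC call header; a descriptive layout sketch (the struct and field names are illustrative and do not appear in the source):

/*
 * On-the-wire ONC RPC call header as consumed above.  Every field is
 * a big-endian 32-bit word, which is why fxdr_unsigned() (essentially
 * ntohl) is applied to each.  Names here are descriptive only.
 */
struct rpc_call_header {
	u_int32_t xid;         /* transaction id -> nd_retxid */
	u_int32_t direction;   /* must equal rpc_call (CALL == 0) */
	u_int32_t rpcvers;     /* must equal rpc_vers (2) */
	u_int32_t prog;        /* must equal nfs_prog (100003) */
	u_int32_t vers;        /* NFS_VER2 (2) or NFS_VER3 (3) */
	u_int32_t proc;        /* -> nd_procnum */
	u_int32_t cred_flavor; /* -> auth_type (auth_unix, etc.) */
	u_int32_t cred_len;    /* must be <= RPCAUTH_MAXSIZ */
	/* credential body, then verifier flavor and length, follow */
};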
2372 * Send a message to the originating process's terminal. The thread and/or
2373 * process may be NULL. YYY the thread should not be NULL, but some
2374 * uio_td's may still be passed as NULL through to
2375 * nfsm_request().
2377 static int
2378 nfs_msg(struct thread *td, char *server, char *msg)
2380 tpr_t tpr;
2382 if (td && td->td_proc)
2383 tpr = tprintf_open(td->td_proc);
2384 else
2385 tpr = NULL;
2386 tprintf(tpr, "nfs server %s: %s\n", server, msg);
2387 tprintf_close(tpr);
2388 return (0);
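
An illustrative call, modeled on how the client's timeout path reports a stalled server (the exact arguments here are an assumption, not copied from this file):

/* Illustrative only: report an unresponsive server on the caller's tty. */
nfs_msg(rep->r_td, nmp->nm_mountp->mnt_stat.f_mntfromname,
	"not responding");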
2391 #ifndef NFS_NOSERVER
2393 * Socket upcall routine for the nfsd sockets.
2394 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
2395 * Essentially do as much as possible non-blocking; otherwise punt,
2396 * and this routine will be called again with MB_WAIT from an nfsd.
2398 void
2399 nfsrv_rcv(struct socket *so, void *arg, int waitflag)
2401 struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
2402 struct mbuf *m;
2403 struct sockaddr *nam;
2404 struct sockbuf sio;
2405 int flags, error;
2406 int nparallel_wakeup = 0;
2408 if ((slp->ns_flag & SLP_VALID) == 0)
2409 return;
2412 * Do not allow an unbounded number of completed RPC records to build
2413 * up before we stop reading data from the socket. Otherwise we could
2414 * end up holding onto an unreasonable number of mbufs for requests
2415 * waiting for service.
2417 * This should give pretty good feedback to the TCP
2418 * layer and prevent a memory crunch for other protocols.
2420 * Note that the same service socket can be dispatched to several
2421 * nfs servers simultaneously.
2423 * The TCP protocol callback calls us with MB_DONTWAIT.
2424 * nfsd calls us with MB_WAIT (typically).
2426 if (waitflag == MB_DONTWAIT && slp->ns_numrec >= nfsd_waiting / 2 + 1) {
2427 slp->ns_flag |= SLP_NEEDQ;
2428 goto dorecs;
2432 * Handle protocol specifics to parse an RPC request. We always
2433 * pull from the socket using non-blocking I/O.
2435 if (so->so_type == SOCK_STREAM) {
2437 * The data has to be read in an orderly fashion from a TCP
2438 * stream, unlike a UDP socket. It is possible for soreceive
2439 * and/or nfsrv_getstream() to block, so make sure only one
2440 * entity is messing around with the TCP stream at any given
2441 * moment. The receive sockbuf's lock in soreceive is not
2442 * sufficient.
2444 * Note that this procedure can be called from any number of
2445 * NFS servers *OR* can be upcalled directly from a TCP
2446 * protocol thread.
2448 if (slp->ns_flag & SLP_GETSTREAM) {
2449 slp->ns_flag |= SLP_NEEDQ;
2450 goto dorecs;
2452 slp->ns_flag |= SLP_GETSTREAM;
2455 * Do soreceive(). Pull out as much data as possible without
2456 * blocking.
2458 sbinit(&sio, 1000000000);
2459 flags = MSG_DONTWAIT;
2460 error = so_pru_soreceive(so, &nam, NULL, &sio, NULL, &flags);
2461 if (error || sio.sb_mb == NULL) {
2462 if (error == EWOULDBLOCK)
2463 slp->ns_flag |= SLP_NEEDQ;
2464 else
2465 slp->ns_flag |= SLP_DISCONN;
2466 slp->ns_flag &= ~SLP_GETSTREAM;
2467 goto dorecs;
2469 m = sio.sb_mb;
2470 if (slp->ns_rawend) {
2471 slp->ns_rawend->m_next = m;
2472 slp->ns_cc += sio.sb_cc;
2473 } else {
2474 slp->ns_raw = m;
2475 slp->ns_cc = sio.sb_cc;
2477 while (m->m_next)
2478 m = m->m_next;
2479 slp->ns_rawend = m;
2482 * Now try to parse as many records as we can out of the
2483 * raw stream data.
2485 error = nfsrv_getstream(slp, waitflag, &nparallel_wakeup);
2486 if (error) {
2487 if (error == EPERM)
2488 slp->ns_flag |= SLP_DISCONN;
2489 else
2490 slp->ns_flag |= SLP_NEEDQ;
2492 slp->ns_flag &= ~SLP_GETSTREAM;
2493 } else {
2495 * For UDP soreceive typically pulls just one packet, loop
2496 * to get the whole batch.
2498 do {
2499 sbinit(&sio, 1000000000);
2500 flags = MSG_DONTWAIT;
2501 error = so_pru_soreceive(so, &nam, NULL, &sio,
2502 NULL, &flags);
2503 if (sio.sb_mb) {
2504 struct nfsrv_rec *rec;
2505 int mf = (waitflag & MB_DONTWAIT) ?
2506 M_NOWAIT : M_WAITOK;
2507 rec = kmalloc(sizeof(struct nfsrv_rec),
2508 M_NFSRVDESC, mf);
2509 if (!rec) {
2510 if (nam)
2511 FREE(nam, M_SONAME);
2512 m_freem(sio.sb_mb);
2513 continue;
2515 nfs_realign(&sio.sb_mb, 10 * NFSX_UNSIGNED);
2516 rec->nr_address = nam;
2517 rec->nr_packet = sio.sb_mb;
2518 STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
2519 ++slp->ns_numrec;
2520 ++nparallel_wakeup;
2522 if (error) {
2523 if ((so->so_proto->pr_flags & PR_CONNREQUIRED)
2524 && error != EWOULDBLOCK) {
2525 slp->ns_flag |= SLP_DISCONN;
2526 goto dorecs;
2529 } while (sio.sb_mb);
2533 * If we were upcalled from the tcp protocol layer and we have
2534 * fully parsed records ready to go, or there is new data pending,
2535 * or something went wrong, try to wake up an nfsd thread to deal
2536 * with it.
2538 dorecs:
2539 if (waitflag == MB_DONTWAIT && (slp->ns_numrec > 0
2540 || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN)))) {
2541 nfsrv_wakenfsd(slp, nparallel_wakeup);
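
As a worked instance of the throttle near the top of nfsrv_rcv(): with, say, eight nfsd threads waiting, a non-blocking protocol upcall stops draining the socket once five parsed records are queued (numbers illustrative):

/* Illustrative arithmetic for the ns_numrec throttle above. */
int nfsd_waiting = 8;               /* idle nfsd threads (example) */
int limit = nfsd_waiting / 2 + 1;   /* == 5 completed records */
/*
 * An MB_DONTWAIT upcall with ns_numrec >= limit sets SLP_NEEDQ and
 * defers further reads to an nfsd running with MB_WAIT.
 */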
2546 * Try to extract an RPC request from the mbuf data list received on a
2547 * stream socket. The "waitflag" argument indicates whether or not it
2548 * can sleep.
2550 static int
2551 nfsrv_getstream(struct nfssvc_sock *slp, int waitflag, int *countp)
2553 struct mbuf *m, **mpp;
2554 char *cp1, *cp2;
2555 int len;
2556 struct mbuf *om, *m2, *recm;
2557 u_int32_t recmark;
2559 for (;;) {
2560 if (slp->ns_reclen == 0) {
2561 if (slp->ns_cc < NFSX_UNSIGNED)
2562 return (0);
2563 m = slp->ns_raw;
2564 if (m->m_len >= NFSX_UNSIGNED) {
2565 bcopy(mtod(m, caddr_t), (caddr_t)&recmark, NFSX_UNSIGNED);
2566 m->m_data += NFSX_UNSIGNED;
2567 m->m_len -= NFSX_UNSIGNED;
2568 } else {
2569 cp1 = (caddr_t)&recmark;
2570 cp2 = mtod(m, caddr_t);
2571 while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
2572 while (m->m_len == 0) {
2573 m = m->m_next;
2574 cp2 = mtod(m, caddr_t);
2576 *cp1++ = *cp2++;
2577 m->m_data++;
2578 m->m_len--;
2581 slp->ns_cc -= NFSX_UNSIGNED;
2582 recmark = ntohl(recmark);
2583 slp->ns_reclen = recmark & ~0x80000000;
2584 if (recmark & 0x80000000)
2585 slp->ns_flag |= SLP_LASTFRAG;
2586 else
2587 slp->ns_flag &= ~SLP_LASTFRAG;
2588 if (slp->ns_reclen > NFS_MAXPACKET || slp->ns_reclen <= 0) {
2589 log(LOG_ERR, "%s (%d) from nfs client\n",
2590 "impossible packet length",
2591 slp->ns_reclen);
2592 return (EPERM);
2597 * Now get the record part.
2599 * Note that slp->ns_reclen may be 0. Linux sometimes
2600 * generates 0-length RPCs.
2602 recm = NULL;
2603 if (slp->ns_cc == slp->ns_reclen) {
2604 recm = slp->ns_raw;
2605 slp->ns_raw = slp->ns_rawend = NULL;
2606 slp->ns_cc = slp->ns_reclen = 0;
2607 } else if (slp->ns_cc > slp->ns_reclen) {
2608 len = 0;
2609 m = slp->ns_raw;
2610 om = NULL;
2612 while (len < slp->ns_reclen) {
2613 if ((len + m->m_len) > slp->ns_reclen) {
2614 m2 = m_copym(m, 0, slp->ns_reclen - len,
2615 waitflag);
2616 if (m2) {
2617 if (om) {
2618 om->m_next = m2;
2619 recm = slp->ns_raw;
2620 } else
2621 recm = m2;
2622 m->m_data += slp->ns_reclen - len;
2623 m->m_len -= slp->ns_reclen - len;
2624 len = slp->ns_reclen;
2625 } else {
2626 return (EWOULDBLOCK);
2628 } else if ((len + m->m_len) == slp->ns_reclen) {
2629 om = m;
2630 len += m->m_len;
2631 m = m->m_next;
2632 recm = slp->ns_raw;
2633 om->m_next = NULL;
2634 } else {
2635 om = m;
2636 len += m->m_len;
2637 m = m->m_next;
2640 slp->ns_raw = m;
2641 slp->ns_cc -= len;
2642 slp->ns_reclen = 0;
2643 } else {
2644 return (0);
2648 * Accumulate the fragments into a record.
2650 mpp = &slp->ns_frag;
2651 while (*mpp)
2652 mpp = &((*mpp)->m_next);
2653 *mpp = recm;
2654 if (slp->ns_flag & SLP_LASTFRAG) {
2655 struct nfsrv_rec *rec;
2656 int mf = (waitflag & MB_DONTWAIT) ? M_NOWAIT : M_WAITOK;
2657 rec = kmalloc(sizeof(struct nfsrv_rec), M_NFSRVDESC, mf);
2658 if (!rec) {
2659 m_freem(slp->ns_frag);
2660 } else {
2661 nfs_realign(&slp->ns_frag, 10 * NFSX_UNSIGNED);
2662 rec->nr_address = NULL;
2663 rec->nr_packet = slp->ns_frag;
2664 STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
2665 ++slp->ns_numrec;
2666 ++*countp;
2668 slp->ns_frag = NULL;
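
The record mark decoded above is the standard RPC-over-TCP record-marking word (RFC 1057 style framing); a standalone decode of the same format (illustrative helper, not from this file):

#include <stdint.h>
#include <arpa/inet.h>	/* ntohl */

/*
 * Decode one RPC record-marking word the way nfsrv_getstream() does:
 * the high bit marks the last fragment of a record, the low 31 bits
 * carry the fragment length.
 */
static void
decode_recmark(uint32_t wire, uint32_t *fraglen, int *lastfrag)
{
	uint32_t recmark = ntohl(wire);

	*lastfrag = (recmark & 0x80000000) != 0;
	*fraglen = recmark & ~0x80000000;
}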
2674 * Parse an RPC header.
2677 nfsrv_dorec(struct nfssvc_sock *slp, struct nfsd *nfsd,
2678 struct nfsrv_descript **ndp)
2680 struct nfsrv_rec *rec;
2681 struct mbuf *m;
2682 struct sockaddr *nam;
2683 struct nfsrv_descript *nd;
2684 int error;
2686 *ndp = NULL;
2687 if ((slp->ns_flag & SLP_VALID) == 0 || !STAILQ_FIRST(&slp->ns_rec))
2688 return (ENOBUFS);
2689 rec = STAILQ_FIRST(&slp->ns_rec);
2690 STAILQ_REMOVE_HEAD(&slp->ns_rec, nr_link);
2691 KKASSERT(slp->ns_numrec > 0);
2692 --slp->ns_numrec;
2693 nam = rec->nr_address;
2694 m = rec->nr_packet;
2695 kfree(rec, M_NFSRVDESC);
2696 MALLOC(nd, struct nfsrv_descript *, sizeof (struct nfsrv_descript),
2697 M_NFSRVDESC, M_WAITOK);
2698 nd->nd_md = nd->nd_mrep = m;
2699 nd->nd_nam2 = nam;
2700 nd->nd_dpos = mtod(m, caddr_t);
2701 error = nfs_getreq(nd, nfsd, TRUE);
2702 if (error) {
2703 if (nam) {
2704 FREE(nam, M_SONAME);
2706 kfree((caddr_t)nd, M_NFSRVDESC);
2707 return (error);
2709 *ndp = nd;
2710 nfsd->nfsd_nd = nd;
2711 return (0);
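
A minimal sketch of the consumer side, assuming a hypothetical nfsd loop body (none of this loop is copied from the source):

/*
 * Hypothetical nfsd loop body, for illustration only: drain parsed
 * records one at a time.  ENOBUFS means the queue is empty (or the
 * socket is no longer valid) and the nfsd should look elsewhere.
 */
struct nfsrv_descript *nd;
int error;

while ((error = nfsrv_dorec(slp, nfsd, &nd)) == 0) {
	/* ... dispatch nd->nd_procnum and send the reply ... */
}
if (error == ENOBUFS)
	; /* nothing queued; recheck NFSD_CHECKSLP and go back to sleep */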
2715 * Try to assign service sockets to nfsd threads based on the number
2716 * of new rpc requests that have been queued on the service socket.
2718 * If no nfsds are available or additional requests are pending, set the
2719 * NFSD_CHECKSLP flag so that one of the running nfsds will go look for
2720 * the work in the nfssvc_sock list when it is finished processing its
2721 * current work. This flag is only cleared when an nfsd cannot find
2722 * any new work to perform.
2724 void
2725 nfsrv_wakenfsd(struct nfssvc_sock *slp, int nparallel)
2727 struct nfsd *nd;
2729 if ((slp->ns_flag & SLP_VALID) == 0)
2730 return;
2731 if (nparallel <= 1)
2732 nparallel = 1;
2733 TAILQ_FOREACH(nd, &nfsd_head, nfsd_chain) {
2734 if (nd->nfsd_flag & NFSD_WAITING) {
2735 nd->nfsd_flag &= ~NFSD_WAITING;
2736 if (nd->nfsd_slp)
2737 panic("nfsd wakeup");
2738 slp->ns_sref++;
2739 nd->nfsd_slp = slp;
2740 wakeup((caddr_t)nd);
2741 if (--nparallel == 0)
2742 break;
2745 if (nparallel) {
2746 slp->ns_flag |= SLP_DOREC;
2747 nfsd_head_flag |= NFSD_CHECKSLP;
2750 #endif /* NFS_NOSERVER */