sys/vfs/nfs/nfs_socket.c (dragonfly.git)
1 /*
2 * Copyright (c) 1989, 1991, 1993, 1995
3 * The Regents of the University of California. All rights reserved.
5 * This code is derived from software contributed to Berkeley by
6 * Rick Macklem at The University of Guelph.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 * must display the following acknowledgement:
18 * This product includes software developed by the University of
19 * California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
36 * @(#)nfs_socket.c 8.5 (Berkeley) 3/30/95
37 * $FreeBSD: src/sys/nfs/nfs_socket.c,v 1.60.2.6 2003/03/26 01:44:46 alfred Exp $
38 * $DragonFly: src/sys/vfs/nfs/nfs_socket.c,v 1.45 2007/05/18 17:05:13 dillon Exp $
42 * Socket operations for use by nfs
45 #include <sys/param.h>
46 #include <sys/systm.h>
47 #include <sys/proc.h>
48 #include <sys/malloc.h>
49 #include <sys/mount.h>
50 #include <sys/kernel.h>
51 #include <sys/mbuf.h>
52 #include <sys/vnode.h>
53 #include <sys/fcntl.h>
54 #include <sys/protosw.h>
55 #include <sys/resourcevar.h>
56 #include <sys/socket.h>
57 #include <sys/socketvar.h>
58 #include <sys/socketops.h>
59 #include <sys/syslog.h>
60 #include <sys/thread.h>
61 #include <sys/tprintf.h>
62 #include <sys/sysctl.h>
63 #include <sys/signalvar.h>
64 #include <sys/signal2.h>
66 #include <netinet/in.h>
67 #include <netinet/tcp.h>
68 #include <sys/thread2.h>
70 #include "rpcv2.h"
71 #include "nfsproto.h"
72 #include "nfs.h"
73 #include "xdr_subs.h"
74 #include "nfsm_subs.h"
75 #include "nfsmount.h"
76 #include "nfsnode.h"
77 #include "nfsrtt.h"
79 #define TRUE 1
80 #define FALSE 0
83 * Estimate rto for an nfs rpc sent via an unreliable datagram.
84 * Use the mean and mean deviation of rtt for the appropriate type of rpc
85 * for the frequent rpcs and a default for the others.
86 * The justification for doing "other" this way is that these rpcs
87 * happen so infrequently that a timer estimate would probably be stale.
88 * Also, since many of these rpcs are
89 * non-idempotent, a conservative timeout is desired.
90 * getattr, lookup - A+2D
91 * read, write - A+4D
92 * other - nm_timeo
94 #define NFS_RTO(n, t) \
95 ((t) == 0 ? (n)->nm_timeo : \
96 ((t) < 3 ? \
97 (((((n)->nm_srtt[t-1] + 3) >> 2) + (n)->nm_sdrtt[t-1] + 1) >> 1) : \
98 ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1)))
99 #define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1]
100 #define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1]
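#if 0
/*
 * Editor's illustrative sketch (not part of this file): the fixed-point
 * arithmetic behind NFS_RTO, assuming -- as the update code in nfs_reply()
 * maintains -- that nm_srtt[] holds the smoothed rtt scaled by 8 and
 * nm_sdrtt[] the mean deviation scaled by 4.  With A = 10 ticks and
 * D = 2 ticks the two branches yield A+2D and A+4D as advertised above.
 */
#include <stdio.h>

int
main(void)
{
	int srtt = 10 << 3;		/* A = 10 ticks, scaled by 8 */
	int sdrtt = 2 << 2;		/* D = 2 ticks, scaled by 4 */

	/* timer classes 1-2 (getattr, lookup): A + 2D */
	printf("%d\n", ((((srtt + 3) >> 2) + sdrtt + 1) >> 1));	/* 14 */
	/* timer classes 3-4 (read, write): A + 4D, plus rounding */
	printf("%d\n", (((srtt + 7) >> 3) + sdrtt + 1));	/* 19 */
	return (0);
}
#endif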
102 * External data, mostly RPC constants in XDR form
104 extern u_int32_t rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers,
105 rpc_auth_unix, rpc_msgaccepted, rpc_call, rpc_autherr,
106 rpc_auth_kerb;
107 extern u_int32_t nfs_prog;
108 extern struct nfsstats nfsstats;
109 extern int nfsv3_procid[NFS_NPROCS];
110 extern int nfs_ticks;
113 * Defines which timer to use for the procnum.
114 * 0 - default
115 * 1 - getattr
116 * 2 - lookup
117 * 3 - read
118 * 4 - write
120 static int proct[NFS_NPROCS] = {
121 0, 1, 0, 2, 1, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0,
122 0, 0, 0,
125 static int nfs_realign_test;
126 static int nfs_realign_count;
127 static int nfs_bufpackets = 4;
128 static int nfs_timer_raced;
130 SYSCTL_DECL(_vfs_nfs);
132 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_test, CTLFLAG_RW, &nfs_realign_test, 0, "");
133 SYSCTL_INT(_vfs_nfs, OID_AUTO, realign_count, CTLFLAG_RW, &nfs_realign_count, 0, "");
134 SYSCTL_INT(_vfs_nfs, OID_AUTO, bufpackets, CTLFLAG_RW, &nfs_bufpackets, 0, "");
138 * There is a congestion window for outstanding rpcs maintained per mount
139 * point. The cwnd size is adjusted in roughly the way that:
140 * Van Jacobson, Congestion Avoidance and Control, In "Proceedings of
141 * SIGCOMM '88". ACM, August 1988.
142 * describes for TCP. The cwnd size is chopped in half on a retransmit timeout
143 * and incremented by 1/cwnd when each rpc reply is received and a full cwnd
144 * of rpcs is in progress.
145 * (The sent count and cwnd are scaled for integer arith.)
146 * Variants of "slow start" were tried and were found to be too much of a
147 * performance hit (ave. rtt 3 times larger);
148 * I suspect this is due to the large rtt that nfs rpcs have.
150 #define NFS_CWNDSCALE 256
151 #define NFS_MAXCWND (NFS_CWNDSCALE * 32)
152 static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, };
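#if 0
/*
 * Editor's illustrative sketch (not part of this file): the scaled AIMD
 * steps described above, pulled out of nfs_reply() and nfs_timer() as
 * hypothetical standalone helpers.  With NFS_CWNDSCALE units per rpc,
 * "increment by 1/cwnd per reply" becomes the integer expression below.
 */
static int
cwnd_on_reply(int cwnd)			/* additive increase */
{
	cwnd += (NFS_CWNDSCALE * NFS_CWNDSCALE + (cwnd >> 1)) / cwnd;
	if (cwnd > NFS_MAXCWND)
		cwnd = NFS_MAXCWND;
	return (cwnd);
}

static int
cwnd_on_timeout(int cwnd)		/* multiplicative decrease */
{
	cwnd >>= 1;
	if (cwnd < NFS_CWNDSCALE)	/* floor: one outstanding rpc */
		cwnd = NFS_CWNDSCALE;
	return (cwnd);
}
#endif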
153 int nfsrtton = 0;
154 struct nfsrtt nfsrtt;
155 struct callout nfs_timer_handle;
157 static int nfs_msg (struct thread *,char *,char *);
158 static int nfs_rcvlock (struct nfsreq *);
159 static void nfs_rcvunlock (struct nfsreq *);
160 static void nfs_realign (struct mbuf **pm, int hsiz);
161 static int nfs_receive (struct nfsreq *rep, struct sockaddr **aname,
162 struct mbuf **mp);
163 static void nfs_softterm (struct nfsreq *rep);
164 static int nfs_reconnect (struct nfsreq *rep);
165 #ifndef NFS_NOSERVER
166 static int nfsrv_getstream (struct nfssvc_sock *, int, int *);
168 int (*nfsrv3_procs[NFS_NPROCS]) (struct nfsrv_descript *nd,
169 struct nfssvc_sock *slp,
170 struct thread *td,
171 struct mbuf **mreqp) = {
172 nfsrv_null,
173 nfsrv_getattr,
174 nfsrv_setattr,
175 nfsrv_lookup,
176 nfsrv3_access,
177 nfsrv_readlink,
178 nfsrv_read,
179 nfsrv_write,
180 nfsrv_create,
181 nfsrv_mkdir,
182 nfsrv_symlink,
183 nfsrv_mknod,
184 nfsrv_remove,
185 nfsrv_rmdir,
186 nfsrv_rename,
187 nfsrv_link,
188 nfsrv_readdir,
189 nfsrv_readdirplus,
190 nfsrv_statfs,
191 nfsrv_fsinfo,
192 nfsrv_pathconf,
193 nfsrv_commit,
194 nfsrv_noop,
195 nfsrv_noop,
196 nfsrv_noop,
197 nfsrv_noop
199 #endif /* NFS_NOSERVER */
202 * Initialize sockets and congestion for a new NFS connection.
203 * We do not free the sockaddr if error.
206 nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
208 struct socket *so;
209 int error, rcvreserve, sndreserve;
210 int pktscale;
211 struct sockaddr *saddr;
212 struct sockaddr_in *sin;
213 struct thread *td = &thread0; /* only used for socreate and sobind */
215 nmp->nm_so = (struct socket *)0;
216 saddr = nmp->nm_nam;
217 error = socreate(saddr->sa_family, &nmp->nm_so, nmp->nm_sotype,
218 nmp->nm_soproto, td);
219 if (error)
220 goto bad;
221 so = nmp->nm_so;
222 nmp->nm_soflags = so->so_proto->pr_flags;
225 * Some servers require that the client port be a reserved port number.
227 if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) {
228 struct sockopt sopt;
229 int ip;
230 struct sockaddr_in ssin;
232 bzero(&sopt, sizeof sopt);
233 ip = IP_PORTRANGE_LOW;
234 sopt.sopt_level = IPPROTO_IP;
235 sopt.sopt_name = IP_PORTRANGE;
236 sopt.sopt_val = (void *)&ip;
237 sopt.sopt_valsize = sizeof(ip);
238 sopt.sopt_td = NULL;
239 error = sosetopt(so, &sopt);
240 if (error)
241 goto bad;
242 bzero(&ssin, sizeof ssin);
243 sin = &ssin;
244 sin->sin_len = sizeof (struct sockaddr_in);
245 sin->sin_family = AF_INET;
246 sin->sin_addr.s_addr = INADDR_ANY;
247 sin->sin_port = htons(0);
248 error = sobind(so, (struct sockaddr *)sin, td);
249 if (error)
250 goto bad;
251 bzero(&sopt, sizeof sopt);
252 ip = IP_PORTRANGE_DEFAULT;
253 sopt.sopt_level = IPPROTO_IP;
254 sopt.sopt_name = IP_PORTRANGE;
255 sopt.sopt_val = (void *)&ip;
256 sopt.sopt_valsize = sizeof(ip);
257 sopt.sopt_td = NULL;
258 error = sosetopt(so, &sopt);
259 if (error)
260 goto bad;
264 * Protocols that do not require connections may be optionally left
265 * unconnected for servers that reply from a port other than NFS_PORT.
267 if (nmp->nm_flag & NFSMNT_NOCONN) {
268 if (nmp->nm_soflags & PR_CONNREQUIRED) {
269 error = ENOTCONN;
270 goto bad;
272 } else {
273 error = soconnect(so, nmp->nm_nam, td);
274 if (error)
275 goto bad;
278 * Wait for the connection to complete. Cribbed from the
279 * connect system call but with the wait timing out so
280 * that interruptible mounts don't hang here for a long time.
282 crit_enter();
283 while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
284 (void) tsleep((caddr_t)&so->so_timeo, 0,
285 "nfscon", 2 * hz);
286 if ((so->so_state & SS_ISCONNECTING) &&
287 so->so_error == 0 && rep &&
288 (error = nfs_sigintr(nmp, rep, rep->r_td)) != 0){
289 so->so_state &= ~SS_ISCONNECTING;
290 crit_exit();
291 goto bad;
294 if (so->so_error) {
295 error = so->so_error;
296 so->so_error = 0;
297 crit_exit();
298 goto bad;
300 crit_exit();
302 so->so_rcv.ssb_timeo = (5 * hz);
303 so->so_snd.ssb_timeo = (5 * hz);
306 * Get buffer reservation size from sysctl, but impose reasonable
307 * limits.
309 pktscale = nfs_bufpackets;
310 if (pktscale < 2)
311 pktscale = 2;
312 if (pktscale > 64)
313 pktscale = 64;
315 if (nmp->nm_sotype == SOCK_DGRAM) {
316 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
317 rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
318 NFS_MAXPKTHDR) * pktscale;
319 } else if (nmp->nm_sotype == SOCK_SEQPACKET) {
320 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR) * pktscale;
321 rcvreserve = (max(nmp->nm_rsize, nmp->nm_readdirsize) +
322 NFS_MAXPKTHDR) * pktscale;
323 } else {
324 if (nmp->nm_sotype != SOCK_STREAM)
325 panic("nfscon sotype");
326 if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
327 struct sockopt sopt;
328 int val;
330 bzero(&sopt, sizeof sopt);
331 sopt.sopt_level = SOL_SOCKET;
332 sopt.sopt_name = SO_KEEPALIVE;
333 sopt.sopt_val = &val;
334 sopt.sopt_valsize = sizeof val;
335 val = 1;
336 sosetopt(so, &sopt);
338 if (so->so_proto->pr_protocol == IPPROTO_TCP) {
339 struct sockopt sopt;
340 int val;
342 bzero(&sopt, sizeof sopt);
343 sopt.sopt_level = IPPROTO_TCP;
344 sopt.sopt_name = TCP_NODELAY;
345 sopt.sopt_val = &val;
346 sopt.sopt_valsize = sizeof val;
347 val = 1;
348 sosetopt(so, &sopt);
350 sndreserve = (nmp->nm_wsize + NFS_MAXPKTHDR +
351 sizeof (u_int32_t)) * pktscale;
352 rcvreserve = (nmp->nm_rsize + NFS_MAXPKTHDR +
353 sizeof (u_int32_t)) * pktscale;
355 error = soreserve(so, sndreserve, rcvreserve,
356 &td->td_proc->p_rlimit[RLIMIT_SBSIZE]);
357 if (error)
358 goto bad;
359 so->so_rcv.ssb_flags |= SSB_NOINTR;
360 so->so_snd.ssb_flags |= SSB_NOINTR;
362 /* Initialize other non-zero congestion variables */
363 nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] =
364 nmp->nm_srtt[3] = (NFS_TIMEO << 3);
365 nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] =
366 nmp->nm_sdrtt[3] = 0;
367 nmp->nm_cwnd = NFS_MAXCWND / 2; /* Initial send window */
368 nmp->nm_sent = 0;
369 nmp->nm_timeouts = 0;
370 return (0);
372 bad:
373 nfs_disconnect(nmp);
374 return (error);
378 * Reconnect routine:
379 * Called when a connection is broken on a reliable protocol.
380 * - clean up the old socket
381 * - nfs_connect() again
382 * - set R_MUSTRESEND for all outstanding requests on mount point
383 * If this fails the mount point is DEAD!
384 * nb: Must be called with the nfs_sndlock() set on the mount point.
386 static int
387 nfs_reconnect(struct nfsreq *rep)
389 struct nfsreq *rp;
390 struct nfsmount *nmp = rep->r_nmp;
391 int error;
393 nfs_disconnect(nmp);
394 while ((error = nfs_connect(nmp, rep)) != 0) {
395 if (error == EINTR || error == ERESTART)
396 return (EINTR);
397 (void) tsleep((caddr_t)&lbolt, 0, "nfscon", 0);
401 * Loop through outstanding request list and fix up all requests
402 * on old socket.
404 crit_enter();
405 TAILQ_FOREACH(rp, &nfs_reqq, r_chain) {
406 if (rp->r_nmp == nmp)
407 rp->r_flags |= R_MUSTRESEND;
409 crit_exit();
410 return (0);
414 * NFS disconnect. Clean up and unlink.
416 void
417 nfs_disconnect(struct nfsmount *nmp)
419 struct socket *so;
421 if (nmp->nm_so) {
422 so = nmp->nm_so;
423 nmp->nm_so = (struct socket *)0;
424 soshutdown(so, SHUT_RDWR);
425 soclose(so, FNONBLOCK);
429 void
430 nfs_safedisconnect(struct nfsmount *nmp)
432 struct nfsreq dummyreq;
434 bzero(&dummyreq, sizeof(dummyreq));
435 dummyreq.r_nmp = nmp;
436 dummyreq.r_td = NULL;
437 nfs_rcvlock(&dummyreq);
438 nfs_disconnect(nmp);
439 nfs_rcvunlock(&dummyreq);
443 * This is the nfs send routine. For connection based socket types, it
444 * must be called with an nfs_sndlock() on the socket.
445 * "rep == NULL" indicates that it has been called from a server.
446 * For the client side:
447 * - return EINTR if the RPC is terminated, 0 otherwise
448 * - set R_MUSTRESEND if the send fails for any reason
449 * - do any cleanup required by recoverable socket errors (?)
450 * For the server side:
451 * - return EINTR or ERESTART if interrupted by a signal
452 * - return EPIPE if a connection is lost for connection based sockets (TCP...)
453 * - do any cleanup required by recoverable socket errors (?)
456 nfs_send(struct socket *so, struct sockaddr *nam, struct mbuf *top,
457 struct nfsreq *rep)
459 struct sockaddr *sendnam;
460 int error, soflags, flags;
462 if (rep) {
463 if (rep->r_flags & R_SOFTTERM) {
464 m_freem(top);
465 return (EINTR);
467 if ((so = rep->r_nmp->nm_so) == NULL) {
468 rep->r_flags |= R_MUSTRESEND;
469 m_freem(top);
470 return (0);
472 rep->r_flags &= ~R_MUSTRESEND;
473 soflags = rep->r_nmp->nm_soflags;
474 } else
475 soflags = so->so_proto->pr_flags;
476 if ((soflags & PR_CONNREQUIRED) || (so->so_state & SS_ISCONNECTED))
477 sendnam = (struct sockaddr *)0;
478 else
479 sendnam = nam;
480 if (so->so_type == SOCK_SEQPACKET)
481 flags = MSG_EOR;
482 else
483 flags = 0;
485 error = so_pru_sosend(so, sendnam, NULL, top, NULL, flags,
486 curthread /*XXX*/);
488 * ENOBUFS for dgram sockets is transient and non fatal.
489 * No need to log, and no need to break a soft mount.
491 if (error == ENOBUFS && so->so_type == SOCK_DGRAM) {
492 error = 0;
493 if (rep) /* do backoff retransmit on client */
494 rep->r_flags |= R_MUSTRESEND;
497 if (error) {
498 if (rep) {
499 log(LOG_INFO, "nfs send error %d for server %s\n",error,
500 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
502 * Deal with errors for the client side.
504 if (rep->r_flags & R_SOFTTERM)
505 error = EINTR;
506 else
507 rep->r_flags |= R_MUSTRESEND;
508 } else
509 log(LOG_INFO, "nfsd send error %d\n", error);
512 * Handle any recoverable (soft) socket errors here. (?)
514 if (error != EINTR && error != ERESTART &&
515 error != EWOULDBLOCK && error != EPIPE)
516 error = 0;
518 return (error);
522 * Receive a Sun RPC Request/Reply. For SOCK_DGRAM, the work is all
523 * done by soreceive(), but for SOCK_STREAM we must deal with the Record
524 * Mark and consolidate the data into a new mbuf list.
525 * nb: Sometimes TCP passes the data up to soreceive() in long lists of
526 * small mbufs.
527 * For SOCK_STREAM we must be very careful to read an entire record once
528 * we have read any of it, even if the system call has been interrupted.
530 static int
531 nfs_receive(struct nfsreq *rep, struct sockaddr **aname, struct mbuf **mp)
533 struct socket *so;
534 struct sockbuf sio;
535 struct uio auio;
536 struct iovec aio;
537 struct mbuf *m;
538 struct mbuf *control;
539 u_int32_t len;
540 struct sockaddr **getnam;
541 int error, sotype, rcvflg;
542 struct thread *td = curthread; /* XXX */
545 * Set up arguments for soreceive()
547 *mp = NULL;
548 *aname = NULL;
549 sotype = rep->r_nmp->nm_sotype;
552 * For reliable protocols, lock against other senders/receivers
553 * in case a reconnect is necessary.
554 * For SOCK_STREAM, first get the Record Mark to find out how much
555 * more there is to get.
556 * We must lock the socket against other receivers
557 * until we have an entire rpc request/reply.
559 if (sotype != SOCK_DGRAM) {
560 error = nfs_sndlock(rep);
561 if (error)
562 return (error);
563 tryagain:
565 * Check for fatal errors and resending request.
568 * Ugh: If a reconnect attempt just happened, nm_so
569 * would have changed. NULL indicates a failed
570 * attempt that has essentially shut down this
571 * mount point.
573 if (rep->r_mrep || (rep->r_flags & R_SOFTTERM)) {
574 nfs_sndunlock(rep);
575 return (EINTR);
577 so = rep->r_nmp->nm_so;
578 if (!so) {
579 error = nfs_reconnect(rep);
580 if (error) {
581 nfs_sndunlock(rep);
582 return (error);
584 goto tryagain;
586 while (rep->r_flags & R_MUSTRESEND) {
587 m = m_copym(rep->r_mreq, 0, M_COPYALL, MB_WAIT);
588 nfsstats.rpcretries++;
589 error = nfs_send(so, rep->r_nmp->nm_nam, m, rep);
590 if (error) {
591 if (error == EINTR || error == ERESTART ||
592 (error = nfs_reconnect(rep)) != 0) {
593 nfs_sndunlock(rep);
594 return (error);
596 goto tryagain;
599 nfs_sndunlock(rep);
600 if (sotype == SOCK_STREAM) {
602 * Get the length marker from the stream
604 aio.iov_base = (caddr_t)&len;
605 aio.iov_len = sizeof(u_int32_t);
606 auio.uio_iov = &aio;
607 auio.uio_iovcnt = 1;
608 auio.uio_segflg = UIO_SYSSPACE;
609 auio.uio_rw = UIO_READ;
610 auio.uio_offset = 0;
611 auio.uio_resid = sizeof(u_int32_t);
612 auio.uio_td = td;
613 do {
614 rcvflg = MSG_WAITALL;
615 error = so_pru_soreceive(so, NULL, &auio, NULL,
616 NULL, &rcvflg);
617 if (error == EWOULDBLOCK && rep) {
618 if (rep->r_flags & R_SOFTTERM)
619 return (EINTR);
621 } while (error == EWOULDBLOCK);
623 if (error == 0 && auio.uio_resid > 0) {
625 * Only log short packets if not EOF
627 if (auio.uio_resid != sizeof(u_int32_t))
628 log(LOG_INFO,
629 "short receive (%d/%d) from nfs server %s\n",
630 (int)(sizeof(u_int32_t) - auio.uio_resid),
631 (int)sizeof(u_int32_t),
632 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
633 error = EPIPE;
635 if (error)
636 goto errout;
637 len = ntohl(len) & ~0x80000000;
639 * This is SERIOUS! We are out of sync with the sender
640 * and forcing a disconnect/reconnect is all I can do.
642 if (len > NFS_MAXPACKET) {
643 log(LOG_ERR, "%s (%d) from nfs server %s\n",
644 "impossible packet length",
645 len,
646 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
647 error = EFBIG;
648 goto errout;
652 * Get the rest of the packet as an mbuf chain
654 sbinit(&sio, len);
655 do {
656 rcvflg = MSG_WAITALL;
657 error = so_pru_soreceive(so, NULL, NULL, &sio,
658 NULL, &rcvflg);
659 } while (error == EWOULDBLOCK || error == EINTR ||
660 error == ERESTART);
661 if (error == 0 && sio.sb_cc != len) {
662 if (sio.sb_cc != 0)
663 log(LOG_INFO,
664 "short receive (%d/%d) from nfs server %s\n",
665 len - auio.uio_resid, len,
666 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
667 error = EPIPE;
669 *mp = sio.sb_mb;
670 } else {
672 * Non-stream, so get the whole packet by not
673 * specifying MSG_WAITALL and by specifying a large
674 * length.
676 * We have no use for control msg., but must grab them
677 * and then throw them away so we know what is going
678 * on.
680 sbinit(&sio, 100000000);
681 do {
682 rcvflg = 0;
683 error = so_pru_soreceive(so, NULL, NULL, &sio,
684 &control, &rcvflg);
685 if (control)
686 m_freem(control);
687 if (error == EWOULDBLOCK && rep) {
688 if (rep->r_flags & R_SOFTTERM) {
689 m_freem(sio.sb_mb);
690 return (EINTR);
693 } while (error == EWOULDBLOCK ||
694 (error == 0 && sio.sb_mb == NULL && control));
695 if ((rcvflg & MSG_EOR) == 0)
696 kprintf("Egad!!\n");
697 if (error == 0 && sio.sb_mb == NULL)
698 error = EPIPE;
699 len = sio.sb_cc;
700 *mp = sio.sb_mb;
702 errout:
703 if (error && error != EINTR && error != ERESTART) {
704 m_freem(*mp);
705 *mp = NULL;
706 if (error != EPIPE) {
707 log(LOG_INFO,
708 "receive error %d from nfs server %s\n",
709 error,
710 rep->r_nmp->nm_mountp->mnt_stat.f_mntfromname);
712 error = nfs_sndlock(rep);
713 if (!error) {
714 error = nfs_reconnect(rep);
715 if (!error)
716 goto tryagain;
717 else
718 nfs_sndunlock(rep);
721 } else {
722 if ((so = rep->r_nmp->nm_so) == NULL)
723 return (EACCES);
724 if (so->so_state & SS_ISCONNECTED)
725 getnam = NULL;
726 else
727 getnam = aname;
728 sbinit(&sio, 100000000);
729 do {
730 rcvflg = 0;
731 error = so_pru_soreceive(so, getnam, NULL, &sio,
732 NULL, &rcvflg);
733 if (error == EWOULDBLOCK &&
734 (rep->r_flags & R_SOFTTERM)) {
735 m_freem(sio.sb_mb);
736 return (EINTR);
738 } while (error == EWOULDBLOCK);
739 len = sio.sb_cc;
740 *mp = sio.sb_mb;
742 if (error) {
743 m_freem(*mp);
744 *mp = NULL;
747 * Search for any mbufs that are not a multiple of 4 bytes long
748 * or with m_data not longword aligned.
749 * These could cause pointer alignment problems, so copy them to
750 * well aligned mbufs.
752 nfs_realign(mp, 5 * NFSX_UNSIGNED);
753 return (error);
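#if 0
/*
 * Editor's illustrative sketch (not part of this file): encode/decode of
 * the Sun RPC record mark handled above for SOCK_STREAM -- a 4-byte
 * big-endian word whose high bit marks the last fragment and whose low
 * 31 bits carry the fragment length.  nfs_request() only ever emits
 * single-fragment records, hence its unconditional 0x80000000.  The
 * helper names are hypothetical.
 */
#include <stdint.h>
#include <arpa/inet.h>

static uint32_t
rm_encode(uint32_t len)			/* host length -> wire mark */
{
	return (htonl(0x80000000 | len));
}

static uint32_t
rm_decode(uint32_t wire, int *lastfrag)	/* wire mark -> host length */
{
	uint32_t v = ntohl(wire);

	*lastfrag = (v & 0x80000000) != 0;
	return (v & ~0x80000000);	/* same mask nfs_receive() applies */
}
#endif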
757 * Implement receipt of reply on a socket.
758 * We must search through the list of received datagrams matching them
759 * with outstanding requests using the xid, until ours is found.
761 /* ARGSUSED */
763 nfs_reply(struct nfsreq *myrep)
765 struct nfsreq *rep;
766 struct nfsmount *nmp = myrep->r_nmp;
767 int32_t t1;
768 struct mbuf *mrep, *md;
769 struct sockaddr *nam;
770 u_int32_t rxid, *tl;
771 caddr_t dpos, cp2;
772 int error;
775 * Loop around until we get our own reply
777 for (;;) {
779 * Lock against other receivers so that I don't get stuck in
780 * sbwait() after someone else has received my reply for me.
781 * Also necessary for connection based protocols to avoid
782 * race conditions during a reconnect.
783 * If nfs_rcvlock() returns EALREADY, that means that
784 * the reply has already been received by another
785 * process and we can return immediately. In this
786 * case, the lock is not taken to avoid races with
787 * other processes.
789 error = nfs_rcvlock(myrep);
790 if (error == EALREADY)
791 return (0);
792 if (error)
793 return (error);
795 * Get the next Rpc reply off the socket
797 error = nfs_receive(myrep, &nam, &mrep);
798 nfs_rcvunlock(myrep);
799 if (error) {
801 * Ignore routing errors on connectionless protocols??
803 if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) {
804 nmp->nm_so->so_error = 0;
805 if (myrep->r_flags & R_GETONEREP)
806 return (0);
807 continue;
809 return (error);
811 if (nam)
812 FREE(nam, M_SONAME);
815 * Get the xid and check that it is an rpc reply
817 md = mrep;
818 dpos = mtod(md, caddr_t);
819 nfsm_dissect(tl, u_int32_t *, 2*NFSX_UNSIGNED);
820 rxid = *tl++;
821 if (*tl != rpc_reply) {
822 nfsstats.rpcinvalid++;
823 m_freem(mrep);
824 nfsmout:
825 if (myrep->r_flags & R_GETONEREP)
826 return (0);
827 continue;
831 * Loop through the request list to match up the reply
832 * Iff no match, just drop the datagram. On match, set
833 * r_mrep atomically to prevent the timer from messing
834 * around with the request after we have exited the critical
835 * section.
837 crit_enter();
838 TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
839 if (rep->r_mrep == NULL && rxid == rep->r_xid) {
840 rep->r_mrep = mrep;
841 break;
844 crit_exit();
847 * Fill in the rest of the reply if we found a match.
849 if (rep) {
850 rep->r_md = md;
851 rep->r_dpos = dpos;
852 if (nfsrtton) {
853 struct rttl *rt;
855 rt = &nfsrtt.rttl[nfsrtt.pos];
856 rt->proc = rep->r_procnum;
857 rt->rto = NFS_RTO(nmp, proct[rep->r_procnum]);
858 rt->sent = nmp->nm_sent;
859 rt->cwnd = nmp->nm_cwnd;
860 rt->srtt = nmp->nm_srtt[proct[rep->r_procnum] - 1];
861 rt->sdrtt = nmp->nm_sdrtt[proct[rep->r_procnum] - 1];
862 rt->fsid = nmp->nm_mountp->mnt_stat.f_fsid;
863 getmicrotime(&rt->tstamp);
864 if (rep->r_flags & R_TIMING)
865 rt->rtt = rep->r_rtt;
866 else
867 rt->rtt = 1000000;
868 nfsrtt.pos = (nfsrtt.pos + 1) % NFSRTTLOGSIZ;
871 * Update congestion window.
872 * Do the additive increase of
873 * one rpc/rtt.
875 if (nmp->nm_cwnd <= nmp->nm_sent) {
876 nmp->nm_cwnd +=
877 (NFS_CWNDSCALE * NFS_CWNDSCALE +
878 (nmp->nm_cwnd >> 1)) / nmp->nm_cwnd;
879 if (nmp->nm_cwnd > NFS_MAXCWND)
880 nmp->nm_cwnd = NFS_MAXCWND;
882 crit_enter(); /* nfs_timer interlock for nm_sent */
883 if (rep->r_flags & R_SENT) {
884 rep->r_flags &= ~R_SENT;
885 nmp->nm_sent -= NFS_CWNDSCALE;
887 crit_exit();
889 * Update rtt using a gain of 0.125 on the mean
890 * and a gain of 0.25 on the deviation.
892 if (rep->r_flags & R_TIMING) {
894 * Since the timer resolution of
895 * NFS_HZ is so coarse, it can often
896 * result in r_rtt == 0. Since
897 * r_rtt == N means that the actual
898 * rtt is between N+dt and N+2-dt ticks,
899 * add 1.
901 t1 = rep->r_rtt + 1;
902 t1 -= (NFS_SRTT(rep) >> 3);
903 NFS_SRTT(rep) += t1;
904 if (t1 < 0)
905 t1 = -t1;
906 t1 -= (NFS_SDRTT(rep) >> 2);
907 NFS_SDRTT(rep) += t1;
909 nmp->nm_timeouts = 0;
912 * If not matched to a request, drop it.
913 * If it's mine, get out.
915 if (rep == NULL) {
916 nfsstats.rpcunexpected++;
917 m_freem(mrep);
918 } else if (rep == myrep) {
919 if (rep->r_mrep == NULL)
920 panic("nfsreply nil");
921 return (0);
923 if (myrep->r_flags & R_GETONEREP)
924 return (0);
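#if 0
/*
 * Editor's illustrative sketch (not part of this file): the rtt filter
 * from the tail of nfs_reply() as a hypothetical standalone helper --
 * gain 1/8 on the smoothed mean (kept scaled by 8), gain 1/4 on the
 * mean deviation (kept scaled by 4), i.e. the classic Van Jacobson
 * estimator in integer arithmetic.
 */
static void
rtt_update(int measured, int *srtt, int *sdrtt)
{
	int t1 = measured + 1;		/* +1 compensates the coarse NFS_HZ */

	t1 -= (*srtt >> 3);
	*srtt += t1;			/* srtt += measured - srtt/8 */
	if (t1 < 0)
		t1 = -t1;
	t1 -= (*sdrtt >> 2);
	*sdrtt += t1;			/* sdrtt += |err| - sdrtt/4 */
}
#endif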
929 * nfs_request - goes something like this
930 * - fill in request struct
931 * - links it into list
932 * - calls nfs_send() for first transmit
933 * - calls nfs_receive() to get reply
934 * - break down rpc header and return with nfs reply pointed to
935 * by mrep or error
936 * nb: always frees up mreq mbuf list
939 nfs_request(struct vnode *vp, struct mbuf *mrest, int procnum,
940 struct thread *td, struct ucred *cred, struct mbuf **mrp,
941 struct mbuf **mdp, caddr_t *dposp)
943 struct mbuf *mrep, *m2;
944 struct nfsreq *rep;
945 u_int32_t *tl;
946 int i;
947 struct nfsmount *nmp;
948 struct mbuf *m, *md, *mheadend;
949 char nickv[RPCX_NICKVERF];
950 time_t waituntil;
951 caddr_t dpos, cp2;
952 int t1, error = 0, mrest_len, auth_len, auth_type;
953 int trylater_delay = 15, trylater_cnt = 0, failed_auth = 0;
954 int verf_len, verf_type;
955 u_int32_t xid;
956 char *auth_str, *verf_str;
957 NFSKERBKEY_T key; /* save session key */
959 /* Reject requests while attempting a forced unmount. */
960 if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) {
961 m_freem(mrest);
962 return (ESTALE);
964 nmp = VFSTONFS(vp->v_mount);
965 MALLOC(rep, struct nfsreq *, sizeof(struct nfsreq), M_NFSREQ, M_WAITOK);
966 rep->r_nmp = nmp;
967 rep->r_vp = vp;
968 rep->r_td = td;
969 rep->r_procnum = procnum;
970 rep->r_mreq = NULL;
971 i = 0;
972 m = mrest;
973 while (m) {
974 i += m->m_len;
975 m = m->m_next;
977 mrest_len = i;
980 * Get the RPC header with authorization.
982 kerbauth:
983 verf_str = auth_str = (char *)0;
984 if (nmp->nm_flag & NFSMNT_KERB) {
985 verf_str = nickv;
986 verf_len = sizeof (nickv);
987 auth_type = RPCAUTH_KERB4;
988 bzero((caddr_t)key, sizeof (key));
989 if (failed_auth || nfs_getnickauth(nmp, cred, &auth_str,
990 &auth_len, verf_str, verf_len)) {
991 error = nfs_getauth(nmp, rep, cred, &auth_str,
992 &auth_len, verf_str, &verf_len, key);
993 if (error) {
994 kfree((caddr_t)rep, M_NFSREQ);
995 m_freem(mrest);
996 return (error);
999 } else {
1000 auth_type = RPCAUTH_UNIX;
1001 if (cred->cr_ngroups < 1)
1002 panic("nfsreq nogrps");
1003 auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ?
1004 nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) +
1005 5 * NFSX_UNSIGNED;
1007 m = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len,
1008 auth_str, verf_len, verf_str, mrest, mrest_len, &mheadend, &xid);
1009 if (auth_str)
1010 kfree(auth_str, M_TEMP);
1013 * For stream protocols, insert a Sun RPC Record Mark.
1015 if (nmp->nm_sotype == SOCK_STREAM) {
1016 M_PREPEND(m, NFSX_UNSIGNED, MB_WAIT);
1017 if (m == NULL) {
1018 kfree(rep, M_NFSREQ);
1019 return (ENOBUFS);
1021 *mtod(m, u_int32_t *) = htonl(0x80000000 |
1022 (m->m_pkthdr.len - NFSX_UNSIGNED));
1024 rep->r_mreq = m;
1025 rep->r_xid = xid;
1026 tryagain:
1027 if (nmp->nm_flag & NFSMNT_SOFT)
1028 rep->r_retry = nmp->nm_retry;
1029 else
1030 rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */
1031 rep->r_rtt = rep->r_rexmit = 0;
1032 if (proct[procnum] > 0)
1033 rep->r_flags = R_TIMING | R_MASKTIMER;
1034 else
1035 rep->r_flags = R_MASKTIMER;
1036 rep->r_mrep = NULL;
1039 * Do the client side RPC.
1041 nfsstats.rpcrequests++;
1044 * Chain request into list of outstanding requests. Be sure
1045 * to put it LAST so timer finds oldest requests first. Note
1046 * that R_MASKTIMER is set at the moment to prevent any timer
1047 * action on this request while we are still doing processing on
1048 * it below. splsoftclock() primarily protects nm_sent. Note
1049 * that we may block in this code so there is no atomicy guarentee.
1051 crit_enter();
1052 TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain);
1055 * If backing off another request or avoiding congestion, don't
1056 * send this one now but let timer do it. If not timing a request,
1057 * do it now.
1059 * Even though the timer will not mess with our request there is
1060 * still the possibility that we will race a reply (which clears
1061 * R_SENT), especially on localhost connections, so be very careful
1062 * when setting R_SENT. We could set R_SENT prior to calling
1063 * nfs_send() but why bother if the response occurs that quickly?
1065 if (nmp->nm_so && (nmp->nm_sotype != SOCK_DGRAM ||
1066 (nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1067 nmp->nm_sent < nmp->nm_cwnd)) {
1068 if (nmp->nm_soflags & PR_CONNREQUIRED)
1069 error = nfs_sndlock(rep);
1070 if (!error) {
1071 m2 = m_copym(m, 0, M_COPYALL, MB_WAIT);
1072 error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep);
1073 if (nmp->nm_soflags & PR_CONNREQUIRED)
1074 nfs_sndunlock(rep);
1076 if (!error && (rep->r_flags & R_MUSTRESEND) == 0 &&
1077 rep->r_mrep == NULL) {
1078 KASSERT((rep->r_flags & R_SENT) == 0,
1079 ("R_SENT ASSERT %p", rep));
1080 nmp->nm_sent += NFS_CWNDSCALE;
1081 rep->r_flags |= R_SENT;
1083 } else {
1084 rep->r_rtt = -1;
1088 * Let the timer do what it will with the request, then
1089 * wait for the reply from our send or the timer's.
1091 if (!error || error == EPIPE) {
1092 rep->r_flags &= ~R_MASKTIMER;
1093 crit_exit();
1094 error = nfs_reply(rep);
1095 crit_enter();
1099 * RPC done, unlink the request, but don't rip it out from under
1100 * the callout timer.
1102 while (rep->r_flags & R_LOCKED) {
1103 nfs_timer_raced = 1;
1104 tsleep(&nfs_timer_raced, 0, "nfstrac", 0);
1106 TAILQ_REMOVE(&nfs_reqq, rep, r_chain);
1109 * Decrement the outstanding request count.
1111 if (rep->r_flags & R_SENT) {
1112 rep->r_flags &= ~R_SENT;
1113 nmp->nm_sent -= NFS_CWNDSCALE;
1115 crit_exit();
1118 * If there was a successful reply and a tprintf msg was printed
1119 * earlier ("not responding"), tprintf that the server is alive again.
1121 if (!error && (rep->r_flags & R_TPRINTFMSG))
1122 nfs_msg(rep->r_td, nmp->nm_mountp->mnt_stat.f_mntfromname,
1123 "is alive again");
1124 mrep = rep->r_mrep;
1125 md = rep->r_md;
1126 dpos = rep->r_dpos;
1127 if (error) {
1128 m_freem(rep->r_mreq);
1129 kfree((caddr_t)rep, M_NFSREQ);
1130 return (error);
1134 * break down the rpc header and check if ok
1136 nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
1137 if (*tl++ == rpc_msgdenied) {
1138 if (*tl == rpc_mismatch)
1139 error = EOPNOTSUPP;
1140 else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) {
1141 if (!failed_auth) {
1142 failed_auth++;
1143 mheadend->m_next = (struct mbuf *)0;
1144 m_freem(mrep);
1145 m_freem(rep->r_mreq);
1146 goto kerbauth;
1147 } else
1148 error = EAUTH;
1149 } else
1150 error = EACCES;
1151 m_freem(mrep);
1152 m_freem(rep->r_mreq);
1153 kfree((caddr_t)rep, M_NFSREQ);
1154 return (error);
1158 * Grab any Kerberos verifier, otherwise just throw it away.
1160 verf_type = fxdr_unsigned(int, *tl++);
1161 i = fxdr_unsigned(int32_t, *tl);
1162 if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) {
1163 error = nfs_savenickauth(nmp, cred, i, key, &md, &dpos, mrep);
1164 if (error)
1165 goto nfsmout;
1166 } else if (i > 0)
1167 nfsm_adv(nfsm_rndup(i));
1168 nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
1169 /* 0 == ok */
1170 if (*tl == 0) {
1171 nfsm_dissect(tl, u_int32_t *, NFSX_UNSIGNED);
1172 if (*tl != 0) {
1173 error = fxdr_unsigned(int, *tl);
1174 if ((nmp->nm_flag & NFSMNT_NFSV3) &&
1175 error == NFSERR_TRYLATER) {
1176 m_freem(mrep);
1177 error = 0;
1178 waituntil = time_second + trylater_delay;
1179 while (time_second < waituntil)
1180 (void) tsleep((caddr_t)&lbolt,
1181 0, "nqnfstry", 0);
1182 trylater_delay *= nfs_backoff[trylater_cnt];
1183 if (trylater_cnt < 7)
1184 trylater_cnt++;
1185 goto tryagain;
1189 * If the File Handle was stale, invalidate the
1190 * lookup cache, just in case.
1192 * To avoid namecache<->vnode deadlocks we must
1193 * release the vnode lock if we hold it.
1195 if (error == ESTALE) {
1196 int ltype;
1198 ltype = lockstatus(&vp->v_lock, curthread);
1199 if (ltype == LK_EXCLUSIVE || ltype == LK_SHARED)
1200 lockmgr(&vp->v_lock, LK_RELEASE);
1201 cache_inval_vp(vp, CINV_CHILDREN);
1202 if (ltype == LK_EXCLUSIVE || ltype == LK_SHARED)
1203 lockmgr(&vp->v_lock, ltype);
1205 if (nmp->nm_flag & NFSMNT_NFSV3) {
1206 *mrp = mrep;
1207 *mdp = md;
1208 *dposp = dpos;
1209 error |= NFSERR_RETERR;
1210 } else
1211 m_freem(mrep);
1212 m_freem(rep->r_mreq);
1213 kfree((caddr_t)rep, M_NFSREQ);
1214 return (error);
1217 *mrp = mrep;
1218 *mdp = md;
1219 *dposp = dpos;
1220 m_freem(rep->r_mreq);
1221 FREE((caddr_t)rep, M_NFSREQ);
1222 return (0);
1224 m_freem(mrep);
1225 error = EPROTONOSUPPORT;
1226 nfsmout:
1227 m_freem(rep->r_mreq);
1228 kfree((caddr_t)rep, M_NFSREQ);
1229 return (error);
1232 #ifndef NFS_NOSERVER
1234 * Generate the rpc reply header
1235 * siz arg. is used to decide if adding a cluster is worthwhile
1238 nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp,
1239 int err, struct mbuf **mrq, struct mbuf **mbp, caddr_t *bposp)
1241 u_int32_t *tl;
1242 struct mbuf *mreq;
1243 caddr_t bpos;
1244 struct mbuf *mb, *mb2;
1246 siz += RPC_REPLYSIZ;
1247 mb = mreq = m_getl(max_hdr + siz, MB_WAIT, MT_DATA, M_PKTHDR, NULL);
1248 mreq->m_pkthdr.len = 0;
1250 * If this is not a cluster, try and leave leading space
1251 * for the lower level headers.
1253 if ((max_hdr + siz) < MINCLSIZE)
1254 mreq->m_data += max_hdr;
1255 tl = mtod(mreq, u_int32_t *);
1256 mreq->m_len = 6 * NFSX_UNSIGNED;
1257 bpos = ((caddr_t)tl) + mreq->m_len;
1258 *tl++ = txdr_unsigned(nd->nd_retxid);
1259 *tl++ = rpc_reply;
1260 if (err == ERPCMISMATCH || (err & NFSERR_AUTHERR)) {
1261 *tl++ = rpc_msgdenied;
1262 if (err & NFSERR_AUTHERR) {
1263 *tl++ = rpc_autherr;
1264 *tl = txdr_unsigned(err & ~NFSERR_AUTHERR);
1265 mreq->m_len -= NFSX_UNSIGNED;
1266 bpos -= NFSX_UNSIGNED;
1267 } else {
1268 *tl++ = rpc_mismatch;
1269 *tl++ = txdr_unsigned(RPC_VER2);
1270 *tl = txdr_unsigned(RPC_VER2);
1272 } else {
1273 *tl++ = rpc_msgaccepted;
1276 * For Kerberos authentication, we must send the nickname
1277 * verifier back, otherwise just RPCAUTH_NULL.
1279 if (nd->nd_flag & ND_KERBFULL) {
1280 struct nfsuid *nuidp;
1281 struct timeval ktvin, ktvout;
1283 for (nuidp = NUIDHASH(slp, nd->nd_cr.cr_uid)->lh_first;
1284 nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
1285 if (nuidp->nu_cr.cr_uid == nd->nd_cr.cr_uid &&
1286 (!nd->nd_nam2 || netaddr_match(NU_NETFAM(nuidp),
1287 &nuidp->nu_haddr, nd->nd_nam2)))
1288 break;
1290 if (nuidp) {
1291 ktvin.tv_sec =
1292 txdr_unsigned(nuidp->nu_timestamp.tv_sec - 1);
1293 ktvin.tv_usec =
1294 txdr_unsigned(nuidp->nu_timestamp.tv_usec);
1297 * Encrypt the timestamp in ecb mode using the
1298 * session key.
1300 #ifdef NFSKERB
1302 #endif
1304 *tl++ = rpc_auth_kerb;
1305 *tl++ = txdr_unsigned(3 * NFSX_UNSIGNED);
1306 *tl = ktvout.tv_sec;
1307 nfsm_build(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
1308 *tl++ = ktvout.tv_usec;
1309 *tl++ = txdr_unsigned(nuidp->nu_cr.cr_uid);
1310 } else {
1311 *tl++ = 0;
1312 *tl++ = 0;
1314 } else {
1315 *tl++ = 0;
1316 *tl++ = 0;
1318 switch (err) {
1319 case EPROGUNAVAIL:
1320 *tl = txdr_unsigned(RPC_PROGUNAVAIL);
1321 break;
1322 case EPROGMISMATCH:
1323 *tl = txdr_unsigned(RPC_PROGMISMATCH);
1324 nfsm_build(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
1325 *tl++ = txdr_unsigned(2);
1326 *tl = txdr_unsigned(3);
1327 break;
1328 case EPROCUNAVAIL:
1329 *tl = txdr_unsigned(RPC_PROCUNAVAIL);
1330 break;
1331 case EBADRPC:
1332 *tl = txdr_unsigned(RPC_GARBAGE);
1333 break;
1334 default:
1335 *tl = 0;
1336 if (err != NFSERR_RETVOID) {
1337 nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED);
1338 if (err)
1339 *tl = txdr_unsigned(nfsrv_errmap(nd, err));
1340 else
1341 *tl = 0;
1343 break;
1347 if (mrq != NULL)
1348 *mrq = mreq;
1349 *mbp = mb;
1350 *bposp = bpos;
1351 if (err != 0 && err != NFSERR_RETVOID)
1352 nfsstats.srvrpc_errs++;
1353 return (0);
1357 #endif /* NFS_NOSERVER */
1359 * Nfs timer routine
1360 * Scan the nfsreq list and retransmit any requests that have timed out.
1361 * To avoid retransmission attempts on STREAM sockets (in the future) make
1362 * sure to set the r_retry field to 0 (implies nm_retry == 0).
1364 void
1365 nfs_timer(void *arg /* never used */)
1367 struct nfsreq *rep;
1368 struct mbuf *m;
1369 struct socket *so;
1370 struct nfsmount *nmp;
1371 int timeo;
1372 int error;
1373 #ifndef NFS_NOSERVER
1374 struct nfssvc_sock *slp;
1375 u_quad_t cur_usec;
1376 #endif /* NFS_NOSERVER */
1377 struct thread *td = &thread0; /* XXX for credentials, will break if sleep */
1379 crit_enter();
1380 TAILQ_FOREACH(rep, &nfs_reqq, r_chain) {
1381 nmp = rep->r_nmp;
1382 if (rep->r_mrep || (rep->r_flags & (R_SOFTTERM|R_MASKTIMER)))
1383 continue;
1384 rep->r_flags |= R_LOCKED;
1385 if (nfs_sigintr(nmp, rep, rep->r_td)) {
1386 nfs_softterm(rep);
1387 goto skip;
1389 if (rep->r_rtt >= 0) {
1390 rep->r_rtt++;
1391 if (nmp->nm_flag & NFSMNT_DUMBTIMR)
1392 timeo = nmp->nm_timeo;
1393 else
1394 timeo = NFS_RTO(nmp, proct[rep->r_procnum]);
1395 if (nmp->nm_timeouts > 0)
1396 timeo *= nfs_backoff[nmp->nm_timeouts - 1];
1397 if (rep->r_rtt <= timeo)
1398 goto skip;
1399 if (nmp->nm_timeouts < 8)
1400 nmp->nm_timeouts++;
1403 * Check for server not responding
1405 if ((rep->r_flags & R_TPRINTFMSG) == 0 &&
1406 rep->r_rexmit > nmp->nm_deadthresh) {
1407 nfs_msg(rep->r_td,
1408 nmp->nm_mountp->mnt_stat.f_mntfromname,
1409 "not responding");
1410 rep->r_flags |= R_TPRINTFMSG;
1412 if (rep->r_rexmit >= rep->r_retry) { /* too many */
1413 nfsstats.rpctimeouts++;
1414 nfs_softterm(rep);
1415 goto skip;
1417 if (nmp->nm_sotype != SOCK_DGRAM) {
1418 if (++rep->r_rexmit > NFS_MAXREXMIT)
1419 rep->r_rexmit = NFS_MAXREXMIT;
1420 goto skip;
1422 if ((so = nmp->nm_so) == NULL)
1423 goto skip;
1426 * If there is enough space and the window allows..
1427 * Resend it
1428 * Set r_rtt to -1 in case we fail to send it now.
1430 rep->r_rtt = -1;
1431 if (ssb_space(&so->so_snd) >= rep->r_mreq->m_pkthdr.len &&
1432 ((nmp->nm_flag & NFSMNT_DUMBTIMR) ||
1433 (rep->r_flags & R_SENT) ||
1434 nmp->nm_sent < nmp->nm_cwnd) &&
1435 (m = m_copym(rep->r_mreq, 0, M_COPYALL, MB_DONTWAIT))){
1436 if ((nmp->nm_flag & NFSMNT_NOCONN) == 0)
1437 error = so_pru_send(so, 0, m, (struct sockaddr *)0,
1438 (struct mbuf *)0, td);
1439 else
1440 error = so_pru_send(so, 0, m, nmp->nm_nam,
1441 (struct mbuf *)0, td);
1442 if (error) {
1443 if (NFSIGNORE_SOERROR(nmp->nm_soflags, error))
1444 so->so_error = 0;
1445 } else if (rep->r_mrep == NULL) {
1447 * Iff first send, start timing
1448 * else turn timing off, backoff timer
1449 * and divide congestion window by 2.
1451 * It is possible for the so_pru_send() to
1452 * block and for us to race a reply so we
1453 * only do this if the reply field has not
1454 * been filled in. R_LOCKED will prevent
1455 * the request from being ripped out from under
1456 * us entirely.
1458 if (rep->r_flags & R_SENT) {
1459 rep->r_flags &= ~R_TIMING;
1460 if (++rep->r_rexmit > NFS_MAXREXMIT)
1461 rep->r_rexmit = NFS_MAXREXMIT;
1462 nmp->nm_cwnd >>= 1;
1463 if (nmp->nm_cwnd < NFS_CWNDSCALE)
1464 nmp->nm_cwnd = NFS_CWNDSCALE;
1465 nfsstats.rpcretries++;
1466 } else {
1467 rep->r_flags |= R_SENT;
1468 nmp->nm_sent += NFS_CWNDSCALE;
1470 rep->r_rtt = 0;
1473 skip:
1474 rep->r_flags &= ~R_LOCKED;
1476 #ifndef NFS_NOSERVER
1479 * Scan the write gathering queues for writes that need to be
1480 * completed now.
1482 cur_usec = nfs_curusec();
1483 TAILQ_FOREACH(slp, &nfssvc_sockhead, ns_chain) {
1484 if (slp->ns_tq.lh_first && slp->ns_tq.lh_first->nd_time<=cur_usec)
1485 nfsrv_wakenfsd(slp, 1);
1487 #endif /* NFS_NOSERVER */
1490 * Due to possible blocking, a client operation may be waiting for
1491 * us to finish processing this request so it can remove it.
1493 if (nfs_timer_raced) {
1494 nfs_timer_raced = 0;
1495 wakeup(&nfs_timer_raced);
1497 crit_exit();
1498 callout_reset(&nfs_timer_handle, nfs_ticks, nfs_timer, NULL);
1502 * Mark all of an nfs mount's outstanding requests with R_SOFTTERM and
1503 * wait for all requests to complete. This is used by forced unmounts
1504 * to terminate any outstanding RPCs.
1507 nfs_nmcancelreqs(struct nfsmount *nmp)
1509 struct nfsreq *req;
1510 int i;
1512 crit_enter();
1513 TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
1514 if (nmp != req->r_nmp || req->r_mrep != NULL ||
1515 (req->r_flags & R_SOFTTERM)) {
1516 continue;
1518 nfs_softterm(req);
1520 crit_exit();
1522 for (i = 0; i < 30; i++) {
1523 crit_enter();
1524 TAILQ_FOREACH(req, &nfs_reqq, r_chain) {
1525 if (nmp == req->r_nmp)
1526 break;
1528 crit_exit();
1529 if (req == NULL)
1530 return (0);
1531 tsleep(&lbolt, 0, "nfscancel", 0);
1533 return (EBUSY);
1537 * Flag a request as being about to terminate (due to NFSMNT_INT/NFSMNT_SOFT).
1538 * The nm_sent count is decremented now to avoid deadlocks when the process in
1539 * soreceive() hasn't yet managed to send its own request.
1541 * This routine must be called at splsoftclock() to protect r_flags and
1542 * nm_sent.
1545 static void
1546 nfs_softterm(struct nfsreq *rep)
1548 rep->r_flags |= R_SOFTTERM;
1550 if (rep->r_flags & R_SENT) {
1551 rep->r_nmp->nm_sent -= NFS_CWNDSCALE;
1552 rep->r_flags &= ~R_SENT;
1557 * Test for a termination condition pending on the process.
1558 * This is used for NFSMNT_INT mounts.
1561 nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct thread *td)
1563 sigset_t tmpset;
1564 struct proc *p;
1565 struct lwp *lp;
1567 if (rep && (rep->r_flags & R_SOFTTERM))
1568 return (EINTR);
1569 /* Terminate all requests while attempting a forced unmount. */
1570 if (nmp->nm_mountp->mnt_kern_flag & MNTK_UNMOUNTF)
1571 return (EINTR);
1572 if (!(nmp->nm_flag & NFSMNT_INT))
1573 return (0);
1574 /* td might be NULL YYY */
1575 if (td == NULL || (p = td->td_proc) == NULL)
1576 return (0);
1578 lp = td->td_lwp;
1579 tmpset = lwp_sigpend(lp);
1580 SIGSETNAND(tmpset, lp->lwp_sigmask);
1581 SIGSETNAND(tmpset, p->p_sigignore);
1582 if (SIGNOTEMPTY(tmpset) && NFSINT_SIGMASK(tmpset))
1583 return (EINTR);
1585 return (0);
1589 * Lock a socket against others.
1590 * Necessary for STREAM sockets to ensure you get an entire rpc request/reply
1591 * and also to avoid race conditions between the processes with nfs requests
1592 * in progress when a reconnect is necessary.
1595 nfs_sndlock(struct nfsreq *rep)
1597 int *statep = &rep->r_nmp->nm_state;
1598 struct thread *td;
1599 int slptimeo;
1600 int slpflag;
1601 int error;
1603 slpflag = 0;
1604 slptimeo = 0;
1605 td = rep->r_td;
1606 if (rep->r_nmp->nm_flag & NFSMNT_INT)
1607 slpflag = PCATCH;
1609 error = 0;
1610 crit_enter();
1611 while (*statep & NFSSTA_SNDLOCK) {
1612 *statep |= NFSSTA_WANTSND;
1613 if (nfs_sigintr(rep->r_nmp, rep, td)) {
1614 error = EINTR;
1615 break;
1617 tsleep((caddr_t)statep, slpflag, "nfsndlck", slptimeo);
1618 if (slpflag == PCATCH) {
1619 slpflag = 0;
1620 slptimeo = 2 * hz;
1623 /* Always fail if our request has been cancelled. */
1624 if ((rep->r_flags & R_SOFTTERM))
1625 error = EINTR;
1626 if (error == 0)
1627 *statep |= NFSSTA_SNDLOCK;
1628 crit_exit();
1629 return (error);
1633 * Unlock the stream socket for others.
1635 void
1636 nfs_sndunlock(struct nfsreq *rep)
1638 int *statep = &rep->r_nmp->nm_state;
1640 if ((*statep & NFSSTA_SNDLOCK) == 0)
1641 panic("nfs sndunlock");
1642 crit_enter();
1643 *statep &= ~NFSSTA_SNDLOCK;
1644 if (*statep & NFSSTA_WANTSND) {
1645 *statep &= ~NFSSTA_WANTSND;
1646 wakeup((caddr_t)statep);
1648 crit_exit();
1651 static int
1652 nfs_rcvlock(struct nfsreq *rep)
1654 int *statep = &rep->r_nmp->nm_state;
1655 int slpflag;
1656 int slptimeo;
1657 int error;
1660 * Unconditionally check for completion in case another nfsiod
1661 * got the packet while the caller was blocked, before the caller
1662 * called us. Packet reception is handled by mainline code which
1663 * is protected by the BGL at the moment.
1665 * We do not strictly need the second check just before the
1666 * tsleep(), but it's good defensive programming.
1668 if (rep->r_mrep != NULL)
1669 return (EALREADY);
1671 if (rep->r_nmp->nm_flag & NFSMNT_INT)
1672 slpflag = PCATCH;
1673 else
1674 slpflag = 0;
1675 slptimeo = 0;
1676 error = 0;
1677 crit_enter();
1678 while (*statep & NFSSTA_RCVLOCK) {
1679 if (nfs_sigintr(rep->r_nmp, rep, rep->r_td)) {
1680 error = EINTR;
1681 break;
1683 if (rep->r_mrep != NULL) {
1684 error = EALREADY;
1685 break;
1687 *statep |= NFSSTA_WANTRCV;
1688 tsleep((caddr_t)statep, slpflag, "nfsrcvlk", slptimeo);
1690 * If our reply was received while we were sleeping,
1691 * then just return without taking the lock to avoid a
1692 * situation where a single iod could 'capture' the
1693 * receive lock.
1695 if (rep->r_mrep != NULL) {
1696 error = EALREADY;
1697 break;
1699 if (slpflag == PCATCH) {
1700 slpflag = 0;
1701 slptimeo = 2 * hz;
1704 if (error == 0) {
1705 *statep |= NFSSTA_RCVLOCK;
1706 rep->r_nmp->nm_rcvlock_td = curthread; /* DEBUGGING */
1708 crit_exit();
1709 return (error);
1713 * Unlock the stream socket for others.
1715 static void
1716 nfs_rcvunlock(struct nfsreq *rep)
1718 int *statep = &rep->r_nmp->nm_state;
1720 if ((*statep & NFSSTA_RCVLOCK) == 0)
1721 panic("nfs rcvunlock");
1722 crit_enter();
1723 rep->r_nmp->nm_rcvlock_td = (void *)-1; /* DEBUGGING */
1724 *statep &= ~NFSSTA_RCVLOCK;
1725 if (*statep & NFSSTA_WANTRCV) {
1726 *statep &= ~NFSSTA_WANTRCV;
1727 wakeup((caddr_t)statep);
1729 crit_exit();
1733 * nfs_realign:
1735 * Check for badly aligned mbuf data and realign by copying the unaligned
1736 * portion of the data into a new mbuf chain and freeing the portions
1737 * of the old chain that were replaced.
1739 * We cannot simply realign the data within the existing mbuf chain
1740 * because the underlying buffers may contain other rpc commands and
1741 * we cannot afford to overwrite them.
1743 * We would prefer to avoid this situation entirely. The situation does
1744 * not occur with NFS/UDP and is supposed to only occasionally occur
1745 * with TCP. Use vfs.nfs.realign_count and realign_test to check this.
1747 static void
1748 nfs_realign(struct mbuf **pm, int hsiz)
1750 struct mbuf *m;
1751 struct mbuf *n = NULL;
1752 int off = 0;
1754 ++nfs_realign_test;
1756 while ((m = *pm) != NULL) {
1757 if ((m->m_len & 0x3) || (mtod(m, intptr_t) & 0x3)) {
1758 n = m_getl(m->m_len, MB_WAIT, MT_DATA, 0, NULL);
1759 n->m_len = 0;
1760 break;
1762 pm = &m->m_next;
1766 * If n is non-NULL, loop on m copying data, then replace the
1767 * portion of the chain that had to be realigned.
1769 if (n != NULL) {
1770 ++nfs_realign_count;
1771 while (m) {
1772 m_copyback(n, off, m->m_len, mtod(m, caddr_t));
1773 off += m->m_len;
1774 m = m->m_next;
1776 m_freem(*pm);
1777 *pm = n;
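#if 0
/*
 * Editor's illustrative sketch (not part of this file): the alignment
 * predicate nfs_realign() tests, stated on its own.  A buffer is left
 * alone only when its length is a multiple of 4 and its data pointer is
 * longword aligned, so u_int32_t XDR words can be dissected in place.
 * The helper name is hypothetical.
 */
#include <stdint.h>
#include <stddef.h>

static int
nfs_xdr_aligned(const void *data, size_t len)
{
	return ((len & 0x3) == 0 &&
	    ((uintptr_t)data & 0x3) == 0);
}
#endif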
1781 #ifndef NFS_NOSERVER
1784 * Parse an RPC request
1785 * - verify it
1786 * - fill in the cred struct.
1789 nfs_getreq(struct nfsrv_descript *nd, struct nfsd *nfsd, int has_header)
1791 int len, i;
1792 u_int32_t *tl;
1793 int32_t t1;
1794 struct uio uio;
1795 struct iovec iov;
1796 caddr_t dpos, cp2, cp;
1797 u_int32_t nfsvers, auth_type;
1798 uid_t nickuid;
1799 int error = 0, ticklen;
1800 struct mbuf *mrep, *md;
1801 struct nfsuid *nuidp;
1802 struct timeval tvin, tvout;
1803 #if 0 /* until encrypted keys are implemented */
1804 NFSKERBKEYSCHED_T keys; /* stores key schedule */
1805 #endif
1807 mrep = nd->nd_mrep;
1808 md = nd->nd_md;
1809 dpos = nd->nd_dpos;
1810 if (has_header) {
1811 nfsm_dissect(tl, u_int32_t *, 10 * NFSX_UNSIGNED);
1812 nd->nd_retxid = fxdr_unsigned(u_int32_t, *tl++);
1813 if (*tl++ != rpc_call) {
1814 m_freem(mrep);
1815 return (EBADRPC);
1817 } else
1818 nfsm_dissect(tl, u_int32_t *, 8 * NFSX_UNSIGNED);
1819 nd->nd_repstat = 0;
1820 nd->nd_flag = 0;
1821 if (*tl++ != rpc_vers) {
1822 nd->nd_repstat = ERPCMISMATCH;
1823 nd->nd_procnum = NFSPROC_NOOP;
1824 return (0);
1826 if (*tl != nfs_prog) {
1827 nd->nd_repstat = EPROGUNAVAIL;
1828 nd->nd_procnum = NFSPROC_NOOP;
1829 return (0);
1831 tl++;
1832 nfsvers = fxdr_unsigned(u_int32_t, *tl++);
1833 if (nfsvers < NFS_VER2 || nfsvers > NFS_VER3) {
1834 nd->nd_repstat = EPROGMISMATCH;
1835 nd->nd_procnum = NFSPROC_NOOP;
1836 return (0);
1838 if (nfsvers == NFS_VER3)
1839 nd->nd_flag = ND_NFSV3;
1840 nd->nd_procnum = fxdr_unsigned(u_int32_t, *tl++);
1841 if (nd->nd_procnum == NFSPROC_NULL)
1842 return (0);
1843 if (nd->nd_procnum >= NFS_NPROCS ||
1844 (nd->nd_procnum >= NQNFSPROC_GETLEASE) ||
1845 (!nd->nd_flag && nd->nd_procnum > NFSV2PROC_STATFS)) {
1846 nd->nd_repstat = EPROCUNAVAIL;
1847 nd->nd_procnum = NFSPROC_NOOP;
1848 return (0);
1850 if ((nd->nd_flag & ND_NFSV3) == 0)
1851 nd->nd_procnum = nfsv3_procid[nd->nd_procnum];
1852 auth_type = *tl++;
1853 len = fxdr_unsigned(int, *tl++);
1854 if (len < 0 || len > RPCAUTH_MAXSIZ) {
1855 m_freem(mrep);
1856 return (EBADRPC);
1859 nd->nd_flag &= ~ND_KERBAUTH;
1861 * Handle auth_unix or auth_kerb.
1863 if (auth_type == rpc_auth_unix) {
1864 len = fxdr_unsigned(int, *++tl);
1865 if (len < 0 || len > NFS_MAXNAMLEN) {
1866 m_freem(mrep);
1867 return (EBADRPC);
1869 nfsm_adv(nfsm_rndup(len));
1870 nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
1871 bzero((caddr_t)&nd->nd_cr, sizeof (struct ucred));
1872 nd->nd_cr.cr_ref = 1;
1873 nd->nd_cr.cr_uid = fxdr_unsigned(uid_t, *tl++);
1874 nd->nd_cr.cr_gid = fxdr_unsigned(gid_t, *tl++);
1875 len = fxdr_unsigned(int, *tl);
1876 if (len < 0 || len > RPCAUTH_UNIXGIDS) {
1877 m_freem(mrep);
1878 return (EBADRPC);
1880 nfsm_dissect(tl, u_int32_t *, (len + 2) * NFSX_UNSIGNED);
1881 for (i = 1; i <= len; i++)
1882 if (i < NGROUPS)
1883 nd->nd_cr.cr_groups[i] = fxdr_unsigned(gid_t, *tl++);
1884 else
1885 tl++;
1886 nd->nd_cr.cr_ngroups = (len >= NGROUPS) ? NGROUPS : (len + 1);
1887 if (nd->nd_cr.cr_ngroups > 1)
1888 nfsrvw_sort(nd->nd_cr.cr_groups, nd->nd_cr.cr_ngroups);
1889 len = fxdr_unsigned(int, *++tl);
1890 if (len < 0 || len > RPCAUTH_MAXSIZ) {
1891 m_freem(mrep);
1892 return (EBADRPC);
1894 if (len > 0)
1895 nfsm_adv(nfsm_rndup(len));
1896 } else if (auth_type == rpc_auth_kerb) {
1897 switch (fxdr_unsigned(int, *tl++)) {
1898 case RPCAKN_FULLNAME:
1899 ticklen = fxdr_unsigned(int, *tl);
1900 *((u_int32_t *)nfsd->nfsd_authstr) = *tl;
1901 uio.uio_resid = nfsm_rndup(ticklen) + NFSX_UNSIGNED;
1902 nfsd->nfsd_authlen = uio.uio_resid + NFSX_UNSIGNED;
1903 if (uio.uio_resid > (len - 2 * NFSX_UNSIGNED)) {
1904 m_freem(mrep);
1905 return (EBADRPC);
1907 uio.uio_offset = 0;
1908 uio.uio_iov = &iov;
1909 uio.uio_iovcnt = 1;
1910 uio.uio_segflg = UIO_SYSSPACE;
1911 iov.iov_base = (caddr_t)&nfsd->nfsd_authstr[4];
1912 iov.iov_len = RPCAUTH_MAXSIZ - 4;
1913 nfsm_mtouio(&uio, uio.uio_resid);
1914 nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
1915 if (*tl++ != rpc_auth_kerb ||
1916 fxdr_unsigned(int, *tl) != 4 * NFSX_UNSIGNED) {
1917 kprintf("Bad kerb verifier\n");
1918 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
1919 nd->nd_procnum = NFSPROC_NOOP;
1920 return (0);
1922 nfsm_dissect(cp, caddr_t, 4 * NFSX_UNSIGNED);
1923 tl = (u_int32_t *)cp;
1924 if (fxdr_unsigned(int, *tl) != RPCAKN_FULLNAME) {
1925 kprintf("Not fullname kerb verifier\n");
1926 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
1927 nd->nd_procnum = NFSPROC_NOOP;
1928 return (0);
1930 cp += NFSX_UNSIGNED;
1931 bcopy(cp, nfsd->nfsd_verfstr, 3 * NFSX_UNSIGNED);
1932 nfsd->nfsd_verflen = 3 * NFSX_UNSIGNED;
1933 nd->nd_flag |= ND_KERBFULL;
1934 nfsd->nfsd_flag |= NFSD_NEEDAUTH;
1935 break;
1936 case RPCAKN_NICKNAME:
1937 if (len != 2 * NFSX_UNSIGNED) {
1938 kprintf("Kerb nickname short\n");
1939 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADCRED);
1940 nd->nd_procnum = NFSPROC_NOOP;
1941 return (0);
1943 nickuid = fxdr_unsigned(uid_t, *tl);
1944 nfsm_dissect(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
1945 if (*tl++ != rpc_auth_kerb ||
1946 fxdr_unsigned(int, *tl) != 3 * NFSX_UNSIGNED) {
1947 kprintf("Kerb nick verifier bad\n");
1948 nd->nd_repstat = (NFSERR_AUTHERR|AUTH_BADVERF);
1949 nd->nd_procnum = NFSPROC_NOOP;
1950 return (0);
1952 nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
1953 tvin.tv_sec = *tl++;
1954 tvin.tv_usec = *tl;
1956 for (nuidp = NUIDHASH(nfsd->nfsd_slp,nickuid)->lh_first;
1957 nuidp != 0; nuidp = nuidp->nu_hash.le_next) {
1958 if (nuidp->nu_cr.cr_uid == nickuid &&
1959 (!nd->nd_nam2 ||
1960 netaddr_match(NU_NETFAM(nuidp),
1961 &nuidp->nu_haddr, nd->nd_nam2)))
1962 break;
1964 if (!nuidp) {
1965 nd->nd_repstat =
1966 (NFSERR_AUTHERR|AUTH_REJECTCRED);
1967 nd->nd_procnum = NFSPROC_NOOP;
1968 return (0);
1972 * Now, decrypt the timestamp using the session key
1973 * and validate it.
1975 #ifdef NFSKERB
1977 #endif
1979 tvout.tv_sec = fxdr_unsigned(long, tvout.tv_sec);
1980 tvout.tv_usec = fxdr_unsigned(long, tvout.tv_usec);
1981 if (nuidp->nu_expire < time_second ||
1982 nuidp->nu_timestamp.tv_sec > tvout.tv_sec ||
1983 (nuidp->nu_timestamp.tv_sec == tvout.tv_sec &&
1984 nuidp->nu_timestamp.tv_usec > tvout.tv_usec)) {
1985 nuidp->nu_expire = 0;
1986 nd->nd_repstat =
1987 (NFSERR_AUTHERR|AUTH_REJECTVERF);
1988 nd->nd_procnum = NFSPROC_NOOP;
1989 return (0);
1991 nfsrv_setcred(&nuidp->nu_cr, &nd->nd_cr);
1992 nd->nd_flag |= ND_KERBNICK;
1994 } else {
1995 nd->nd_repstat = (NFSERR_AUTHERR | AUTH_REJECTCRED);
1996 nd->nd_procnum = NFSPROC_NOOP;
1997 return (0);
2000 nd->nd_md = md;
2001 nd->nd_dpos = dpos;
2002 return (0);
2003 nfsmout:
2004 return (error);
2007 #endif
2010 * Send a message to the originating process's terminal. The thread and/or
2011 * process may be NULL. YYY the thread should not be NULL but there may
2012 * still be some uio_td's that are still being passed as NULL through to
2013 * nfsm_request().
2015 static int
2016 nfs_msg(struct thread *td, char *server, char *msg)
2018 tpr_t tpr;
2020 if (td && td->td_proc)
2021 tpr = tprintf_open(td->td_proc);
2022 else
2023 tpr = NULL;
2024 tprintf(tpr, "nfs server %s: %s\n", server, msg);
2025 tprintf_close(tpr);
2026 return (0);
2029 #ifndef NFS_NOSERVER
2031 * Socket upcall routine for the nfsd sockets.
2032 * The caddr_t arg is a pointer to the "struct nfssvc_sock".
2033 * Essentially do as much as possible non-blocking, else punt and it will
2034 * be called with MB_WAIT from an nfsd.
2036 void
2037 nfsrv_rcv(struct socket *so, void *arg, int waitflag)
2039 struct nfssvc_sock *slp = (struct nfssvc_sock *)arg;
2040 struct mbuf *m;
2041 struct sockaddr *nam;
2042 struct sockbuf sio;
2043 int flags, error;
2044 int nparallel_wakeup = 0;
2046 if ((slp->ns_flag & SLP_VALID) == 0)
2047 return;
2050 * Do not allow an infinite number of completed RPC records to build
2051 * up before we stop reading data from the socket. Otherwise we could
2052 * end up holding onto an unreasonable number of mbufs for requests
2053 * waiting for service.
2055 * This should give pretty good feedback to the TCP
2056 * layer and prevents a memory crunch for other protocols.
2058 * Note that the same service socket can be dispatched to several
2059 * nfs servers simultaneously.
2061 * the tcp protocol callback calls us with MB_DONTWAIT.
2062 * nfsd calls us with MB_WAIT (typically).
2064 if (waitflag == MB_DONTWAIT && slp->ns_numrec >= nfsd_waiting / 2 + 1) {
2065 slp->ns_flag |= SLP_NEEDQ;
2066 goto dorecs;

	/*
	 * Handle protocol specifics to parse an RPC request.  We always
	 * pull from the socket using non-blocking I/O.
	 */
	if (so->so_type == SOCK_STREAM) {
		/*
		 * The data has to be read in an orderly fashion from a TCP
		 * stream, unlike a UDP socket.  It is possible for soreceive
		 * and/or nfsrv_getstream() to block, so make sure only one
		 * entity is messing around with the TCP stream at any given
		 * moment.  The receive sockbuf's lock in soreceive is not
		 * sufficient.
		 *
		 * Note that this procedure can be called from any number of
		 * NFS servers *OR* can be upcalled directly from a TCP
		 * protocol thread.
		 */
		if (slp->ns_flag & SLP_GETSTREAM) {
			slp->ns_flag |= SLP_NEEDQ;
			goto dorecs;
		}
		slp->ns_flag |= SLP_GETSTREAM;

		/*
		 * Do soreceive().  Pull out as much data as possible without
		 * blocking.
		 */
		sbinit(&sio, 1000000000);
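		/*
		 * The 1000000000-byte sockbuf limit is effectively "no
		 * limit" here; with MSG_DONTWAIT, soreceive returns only
		 * whatever is currently buffered on the socket.
		 */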
		flags = MSG_DONTWAIT;
		error = so_pru_soreceive(so, &nam, NULL, &sio, NULL, &flags);
		if (error || sio.sb_mb == NULL) {
			if (error == EWOULDBLOCK)
				slp->ns_flag |= SLP_NEEDQ;
			else
				slp->ns_flag |= SLP_DISCONN;
			slp->ns_flag &= ~SLP_GETSTREAM;
			goto dorecs;
		}
		m = sio.sb_mb;
		if (slp->ns_rawend) {
			slp->ns_rawend->m_next = m;
			slp->ns_cc += sio.sb_cc;
		} else {
			slp->ns_raw = m;
			slp->ns_cc = sio.sb_cc;
		}
		while (m->m_next)
			m = m->m_next;
		slp->ns_rawend = m;
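		/*
		 * Invariant after the append: ns_raw heads the unparsed
		 * byte stream, ns_rawend points at its last mbuf so the
		 * next append is O(1), and ns_cc counts the bytes queued.
		 */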

		/*
		 * Now try to parse as many record(s) as we can out of the
		 * raw stream data.
		 */
		error = nfsrv_getstream(slp, waitflag, &nparallel_wakeup);
		if (error) {
			if (error == EPERM)
				slp->ns_flag |= SLP_DISCONN;
			else
				slp->ns_flag |= SLP_NEEDQ;
		}
		slp->ns_flag &= ~SLP_GETSTREAM;
	} else {
		/*
		 * For UDP, soreceive typically pulls just one packet, so
		 * loop to get the whole batch.
		 */
		do {
			sbinit(&sio, 1000000000);
			flags = MSG_DONTWAIT;
			error = so_pru_soreceive(so, &nam, NULL, &sio,
						 NULL, &flags);
			if (sio.sb_mb) {
				struct nfsrv_rec *rec;
				int mf = (waitflag & MB_DONTWAIT) ?
					 M_NOWAIT : M_WAITOK;
				rec = kmalloc(sizeof(struct nfsrv_rec),
					      M_NFSRVDESC, mf);
				if (!rec) {
					if (nam)
						FREE(nam, M_SONAME);
					m_freem(sio.sb_mb);
					continue;
				}
				nfs_realign(&sio.sb_mb, 10 * NFSX_UNSIGNED);
				rec->nr_address = nam;
				rec->nr_packet = sio.sb_mb;
				STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
				++slp->ns_numrec;
				++nparallel_wakeup;
			}
			if (error) {
				if ((so->so_proto->pr_flags & PR_CONNREQUIRED)
				    && error != EWOULDBLOCK) {
					slp->ns_flag |= SLP_DISCONN;
					goto dorecs;
				}
			}
		} while (sio.sb_mb);
	}

	/*
	 * If we were upcalled from the tcp protocol layer and we have
	 * fully parsed records ready to go, or there is new data pending,
	 * or something went wrong, try to wake up an nfsd thread to deal
	 * with it.
	 */
dorecs:
	if (waitflag == MB_DONTWAIT && (slp->ns_numrec > 0
	    || (slp->ns_flag & (SLP_NEEDQ | SLP_DISCONN)))) {
		nfsrv_wakenfsd(slp, nparallel_wakeup);
	}
}

/*
 * Try to extract an RPC request from the mbuf data list received on a
 * stream socket.  The "waitflag" argument indicates whether or not it
 * can sleep.
 */
static int
nfsrv_getstream(struct nfssvc_sock *slp, int waitflag, int *countp)
{
	struct mbuf *m, **mpp;
	char *cp1, *cp2;
	int len;
	struct mbuf *om, *m2, *recm;
	u_int32_t recmark;

	for (;;) {
		if (slp->ns_reclen == 0) {
			if (slp->ns_cc < NFSX_UNSIGNED)
				return (0);
			m = slp->ns_raw;
			if (m->m_len >= NFSX_UNSIGNED) {
				bcopy(mtod(m, caddr_t), (caddr_t)&recmark, NFSX_UNSIGNED);
				m->m_data += NFSX_UNSIGNED;
				m->m_len -= NFSX_UNSIGNED;
			} else {
				cp1 = (caddr_t)&recmark;
				cp2 = mtod(m, caddr_t);
				while (cp1 < ((caddr_t)&recmark) + NFSX_UNSIGNED) {
					while (m->m_len == 0) {
						m = m->m_next;
						cp2 = mtod(m, caddr_t);
					}
					*cp1++ = *cp2++;
					m->m_data++;
					m->m_len--;
				}
			}
			slp->ns_cc -= NFSX_UNSIGNED;
			recmark = ntohl(recmark);
			slp->ns_reclen = recmark & ~0x80000000;
			if (recmark & 0x80000000)
				slp->ns_flag |= SLP_LASTFRAG;
			else
				slp->ns_flag &= ~SLP_LASTFRAG;
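			/*
			 * Example of RFC 1831 record marking: a mark of
			 * 0x80000064 announces a final fragment of 100
			 * bytes; 0x00001000 announces a 4096-byte fragment
			 * with more fragments to follow.
			 */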
			if (slp->ns_reclen > NFS_MAXPACKET || slp->ns_reclen <= 0) {
				log(LOG_ERR, "%s (%d) from nfs client\n",
				    "impossible packet length",
				    slp->ns_reclen);
				return (EPERM);
			}
		}
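		/*
		 * Note that the EPERM above is terminal for the stream: the
		 * caller (nfsrv_rcv) responds by marking the socket
		 * SLP_DISCONN rather than trying to resynchronize.
		 */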

		/*
		 * Now get the record part.
		 *
		 * Note that slp->ns_reclen may be 0.  Linux sometimes
		 * generates 0-length RPCs.
		 */
		recm = NULL;
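		/*
		 * Three cases follow: the raw stream holds exactly one
		 * record (take the whole chain), more than one record
		 * (split the chain at the record boundary), or only part
		 * of a record (return and wait for more data).
		 */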
		if (slp->ns_cc == slp->ns_reclen) {
			recm = slp->ns_raw;
			slp->ns_raw = slp->ns_rawend = (struct mbuf *)0;
			slp->ns_cc = slp->ns_reclen = 0;
		} else if (slp->ns_cc > slp->ns_reclen) {
			len = 0;
			m = slp->ns_raw;
			om = (struct mbuf *)0;

			while (len < slp->ns_reclen) {
				if ((len + m->m_len) > slp->ns_reclen) {
					m2 = m_copym(m, 0, slp->ns_reclen - len,
						     waitflag);
					if (m2) {
						if (om) {
							om->m_next = m2;
							recm = slp->ns_raw;
						} else
							recm = m2;
						m->m_data += slp->ns_reclen - len;
						m->m_len -= slp->ns_reclen - len;
						len = slp->ns_reclen;
					} else {
						return (EWOULDBLOCK);
					}
				} else if ((len + m->m_len) == slp->ns_reclen) {
					om = m;
					len += m->m_len;
					m = m->m_next;
					recm = slp->ns_raw;
					om->m_next = (struct mbuf *)0;
				} else {
					om = m;
					len += m->m_len;
					m = m->m_next;
				}
			}
			slp->ns_raw = m;
			slp->ns_cc -= len;
			slp->ns_reclen = 0;
		} else {
			return (0);
		}

		/*
		 * Accumulate the fragments into a record.
		 */
		mpp = &slp->ns_frag;
		while (*mpp)
			mpp = &((*mpp)->m_next);
		*mpp = recm;
		if (slp->ns_flag & SLP_LASTFRAG) {
			struct nfsrv_rec *rec;
			int mf = (waitflag & MB_DONTWAIT) ? M_NOWAIT : M_WAITOK;
			rec = kmalloc(sizeof(struct nfsrv_rec), M_NFSRVDESC, mf);
			if (!rec) {
				m_freem(slp->ns_frag);
			} else {
				nfs_realign(&slp->ns_frag, 10 * NFSX_UNSIGNED);
				rec->nr_address = (struct sockaddr *)0;
				rec->nr_packet = slp->ns_frag;
				STAILQ_INSERT_TAIL(&slp->ns_rec, rec, nr_link);
				++slp->ns_numrec;
				++*countp;
			}
			slp->ns_frag = (struct mbuf *)0;
		}
	}
}

/*
 * Dequeue the next completed record from a service socket, parse its
 * RPC header, and hand the resulting descriptor to the nfsd.
 */
int
nfsrv_dorec(struct nfssvc_sock *slp, struct nfsd *nfsd,
	    struct nfsrv_descript **ndp)
{
	struct nfsrv_rec *rec;
	struct mbuf *m;
	struct sockaddr *nam;
	struct nfsrv_descript *nd;
	int error;

	*ndp = NULL;
	if ((slp->ns_flag & SLP_VALID) == 0 || !STAILQ_FIRST(&slp->ns_rec))
		return (ENOBUFS);
	rec = STAILQ_FIRST(&slp->ns_rec);
	STAILQ_REMOVE_HEAD(&slp->ns_rec, nr_link);
	KKASSERT(slp->ns_numrec > 0);
	--slp->ns_numrec;
	nam = rec->nr_address;
	m = rec->nr_packet;
	kfree(rec, M_NFSRVDESC);
	MALLOC(nd, struct nfsrv_descript *, sizeof (struct nfsrv_descript),
	       M_NFSRVDESC, M_WAITOK);
	nd->nd_md = nd->nd_mrep = m;
	nd->nd_nam2 = nam;
	nd->nd_dpos = mtod(m, caddr_t);
	error = nfs_getreq(nd, nfsd, TRUE);
	if (error) {
		if (nam) {
			FREE(nam, M_SONAME);
		}
		kfree((caddr_t)nd, M_NFSRVDESC);
		return (error);
	}
	*ndp = nd;
	nfsd->nfsd_nd = nd;
	return (0);
}
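
/*
 * Consumer sketch (an assumption -- the actual loop lives in the nfsd
 * service code, not in this file): a woken nfsd drains the socket by
 * calling nfsrv_dorec() until it returns ENOBUFS, dispatching each
 * returned descriptor to the appropriate NFS procedure in between.
 */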

/*
 * Try to assign service sockets to nfsd threads based on the number
 * of new rpc requests that have been queued on the service socket.
 *
 * If no nfsd's are available or additional requests are pending, set the
 * NFSD_CHECKSLP flag so that one of the running nfsds will go look for
 * the work in the nfssvc_sock list when it is finished processing its
 * current work.  This flag is only cleared when an nfsd cannot find
 * any new work to perform.
 */
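/*
 * Example: a call with nparallel == 3 when only two nfsds are idle wakes
 * both of them; nparallel is still nonzero (1) after the loop, so
 * SLP_DOREC and NFSD_CHECKSLP are set and a busy nfsd will pick up the
 * remaining queued request when it finishes its current one.
 */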
void
nfsrv_wakenfsd(struct nfssvc_sock *slp, int nparallel)
{
	struct nfsd *nd;

	if ((slp->ns_flag & SLP_VALID) == 0)
		return;
	if (nparallel <= 1)
		nparallel = 1;
	TAILQ_FOREACH(nd, &nfsd_head, nfsd_chain) {
		if (nd->nfsd_flag & NFSD_WAITING) {
			nd->nfsd_flag &= ~NFSD_WAITING;
			if (nd->nfsd_slp)
				panic("nfsd wakeup");
			slp->ns_sref++;
			nd->nfsd_slp = slp;
			wakeup((caddr_t)nd);
			if (--nparallel == 0)
				break;
		}
	}
	if (nparallel) {
		slp->ns_flag |= SLP_DOREC;
		nfsd_head_flag |= NFSD_CHECKSLP;
	}
}
#endif /* NFS_NOSERVER */