2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3 * The Regents of the University of California. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
36 #include "opt_inet6.h"
37 #include "opt_tcpdebug.h"
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/callout.h>
42 #include <sys/kernel.h>
43 #include <sys/sysctl.h>
44 #include <sys/malloc.h>
48 #include <sys/socket.h>
49 #include <sys/socketvar.h>
50 #include <sys/protosw.h>
51 #include <sys/random.h>
55 #include <net/route.h>
57 #include <net/if_var.h>
60 #include <netinet/in.h>
61 #include <netinet/in_pcb.h>
62 #include <netinet/in_systm.h>
63 #include <netinet/in_var.h>
64 #include <netinet/ip.h>
65 #include <netinet/ip_icmp.h>
66 #include <netinet/ip_var.h>
68 #include <netinet/ip6.h>
69 #include <netinet6/in6_pcb.h>
70 #include <netinet6/ip6_var.h>
71 #include <netinet6/scope6_var.h>
72 #include <netinet6/nd6.h>
74 #include <netinet/tcp.h>
75 #include <netinet/tcp_fsm.h>
76 #include <netinet/tcp_seq.h>
77 #include <netinet/tcp_timer.h>
78 #include <netinet/tcp_var.h>
80 #include <netinet6/tcp6_var.h>
82 #include <netinet/tcpip.h>
84 #include <netinet/tcp_debug.h>
87 #include <netinet6/ip6protosw.h>
90 #include <machine/in_cksum.h>
92 #include <security/mac/mac_framework.h>
/*
 * VNET-scoped UMA zone from which struct tcptw entries are allocated
 * (uma_zalloc) and to which they are returned (uma_zfree).  Always
 * accessed through V_tcptw_zone.
 */
static VNET_DEFINE(uma_zone_t, tcptw_zone);
#define	V_tcptw_zone		VNET(tcptw_zone)
99 * The timed wait queue contains references to each of the TCP sessions
100 * currently in the TIME_WAIT state. The queue pointers, including the
101 * queue pointers in each tcptw structure, are protected using the global
102 * timewait lock, which must be held over queue iteration and modification.
104 * Rules on tcptw usage:
105 * - an inpcb is always freed _after_ its tcptw
106 * - a tcptw relies on its inpcb reference counting for memory stability
107 * - a tcptw is dereferenceable only while its inpcb is locked
109 static VNET_DEFINE(TAILQ_HEAD(, tcptw
), twq_2msl
);
110 #define V_twq_2msl VNET(twq_2msl)
112 /* Global timewait lock */
113 static VNET_DEFINE(struct rwlock
, tw_lock
);
114 #define V_tw_lock VNET(tw_lock)
/*
 * Thin wrappers around the rwlock(9) primitives for the timewait lock.
 * Every macro takes the lock object itself (not a pointer) and passes its
 * address to the underlying rw_*() call.
 */
/* Initialise the lock with description string d and no extra flags. */
#define	TW_LOCK_INIT(tw, d)	rw_init_flags(&(tw), (d), 0)
/* Tear the lock down. */
#define	TW_LOCK_DESTROY(tw)	rw_destroy(&(tw))
/* Acquire / release as a reader. */
#define	TW_RLOCK(tw)		rw_rlock(&(tw))
#define	TW_RUNLOCK(tw)		rw_runlock(&(tw))
/* Acquire / release as a writer. */
#define	TW_WLOCK(tw)		rw_wlock(&(tw))
#define	TW_WUNLOCK(tw)		rw_wunlock(&(tw))
/* Assertions on the lock state, forwarded to rw_assert() with RA_* flags. */
#define	TW_LOCK_ASSERT(tw)	rw_assert(&(tw), RA_LOCKED)
#define	TW_RLOCK_ASSERT(tw)	rw_assert(&(tw), RA_RLOCKED)
#define	TW_WLOCK_ASSERT(tw)	rw_assert(&(tw), RA_WLOCKED)
#define	TW_UNLOCK_ASSERT(tw)	rw_assert(&(tw), RA_UNLOCKED)
/*
 * Forward declarations for file-local helpers that are used before their
 * definitions appear.  Parameter names mirror the definitions.
 */
static void	tcp_tw_2msl_reset(struct tcptw *tw, int rearm);
static void	tcp_tw_2msl_stop(struct tcptw *tw, int reuse);
static int	tcp_twrespond(struct tcptw *tw, int flags);
132 tcptw_auto_size(void)
137 * Max out at half the ephemeral port range so that TIME_WAIT
138 * sockets don't tie up too many ephemeral ports.
140 if (V_ipport_lastauto
> V_ipport_firstauto
)
141 halfrange
= (V_ipport_lastauto
- V_ipport_firstauto
) / 2;
143 halfrange
= (V_ipport_firstauto
- V_ipport_lastauto
) / 2;
144 /* Protect against goofy port ranges smaller than 32. */
145 return (imin(imax(halfrange
, 32), maxsockets
/ 5));
149 sysctl_maxtcptw(SYSCTL_HANDLER_ARGS
)
154 new = tcptw_auto_size();
157 error
= sysctl_handle_int(oidp
, &new, 0, req
);
158 if (error
== 0 && req
->newptr
)
161 uma_zone_set_max(V_tcptw_zone
, maxtcptw
);
166 SYSCTL_PROC(_net_inet_tcp
, OID_AUTO
, maxtcptw
, CTLTYPE_INT
|CTLFLAG_RW
,
167 &maxtcptw
, 0, sysctl_maxtcptw
, "IU",
168 "Maximum number of compressed TCP TIME_WAIT entries");
170 VNET_DEFINE(int, nolocaltimewait
) = 0;
171 #define V_nolocaltimewait VNET(nolocaltimewait)
172 SYSCTL_INT(_net_inet_tcp
, OID_AUTO
, nolocaltimewait
, CTLFLAG_VNET
| CTLFLAG_RW
,
173 &VNET_NAME(nolocaltimewait
), 0,
174 "Do not create compressed TCP TIME_WAIT entries for local connections");
177 tcp_tw_zone_change(void)
181 uma_zone_set_max(V_tcptw_zone
, tcptw_auto_size());
188 V_tcptw_zone
= uma_zcreate("tcptw", sizeof(struct tcptw
),
189 NULL
, NULL
, NULL
, NULL
, UMA_ALIGN_PTR
, 0);
190 TUNABLE_INT_FETCH("net.inet.tcp.maxtcptw", &maxtcptw
);
192 uma_zone_set_max(V_tcptw_zone
, tcptw_auto_size());
194 uma_zone_set_max(V_tcptw_zone
, maxtcptw
);
195 TAILQ_INIT(&V_twq_2msl
);
196 TW_LOCK_INIT(V_tw_lock
, "tcptw");
205 INP_INFO_RLOCK(&V_tcbinfo
);
206 while ((tw
= TAILQ_FIRST(&V_twq_2msl
)) != NULL
)
208 INP_INFO_RUNLOCK(&V_tcbinfo
);
210 TW_LOCK_DESTROY(V_tw_lock
);
211 uma_zdestroy(V_tcptw_zone
);
216 * Move a TCP connection into TIME_WAIT state.
218 * inp is locked, and is unlocked before returning.
221 tcp_twstart(struct tcpcb
*tp
)
224 struct inpcb
*inp
= tp
->t_inpcb
;
228 int isipv6
= inp
->inp_inc
.inc_flags
& INC_ISIPV6
;
231 INP_INFO_RLOCK_ASSERT(&V_tcbinfo
);
232 INP_WLOCK_ASSERT(inp
);
234 if (V_nolocaltimewait
) {
238 error
= in6_localaddr(&inp
->in6p_faddr
);
240 #if defined(INET6) && defined(INET)
244 error
= in_localip(inp
->inp_faddr
);
256 * For use only by DTrace. We do not reference the state
257 * after this point so modifying it in place is not a problem.
259 tcp_state_change(tp
, TCPS_TIME_WAIT
);
261 tw
= uma_zalloc(V_tcptw_zone
, M_NOWAIT
);
264 * Reached limit on total number of TIMEWAIT connections
265 * allowed. Remove a connection from TIMEWAIT queue in LRU
266 * fashion to make room for this connection.
268 * XXX: Check if it is possible to always have enough room
269 * in advance based on guarantees provided by uma_zalloc().
271 tw
= tcp_tw_2msl_scan(1);
280 * The tcptw will hold a reference on its inpcb until tcp_twclose
284 in_pcbref(inp
); /* Reference from tw */
287 * Recover last window size sent.
289 if (SEQ_GT(tp
->rcv_adv
, tp
->rcv_nxt
))
290 tw
->last_win
= (tp
->rcv_adv
- tp
->rcv_nxt
) >> tp
->rcv_scale
;
295 * Set t_recent if timestamps are used on the connection.
297 if ((tp
->t_flags
& (TF_REQ_TSTMP
|TF_RCVD_TSTMP
|TF_NOOPT
)) ==
298 (TF_REQ_TSTMP
|TF_RCVD_TSTMP
)) {
299 tw
->t_recent
= tp
->ts_recent
;
300 tw
->ts_offset
= tp
->ts_offset
;
306 tw
->snd_nxt
= tp
->snd_nxt
;
307 tw
->rcv_nxt
= tp
->rcv_nxt
;
310 tw
->t_starttime
= tp
->t_starttime
;
315 * be used for fin-wait-2 state also, then we may need
316 * a ts_recent from the last segment.
318 acknow
= tp
->t_flags
& TF_ACKNOW
;
321 * First, discard tcpcb state, which includes stopping its timers and
322 * freeing it. tcp_discardcb() used to also release the inpcb, but
323 * that work is now done in the caller.
325 * Note: soisdisconnected() call used to be made in tcp_discardcb(),
326 * and might not be needed here any longer.
329 so
= inp
->inp_socket
;
330 soisdisconnected(so
);
331 tw
->tw_cred
= crhold(so
->so_cred
);
333 tw
->tw_so_options
= so
->so_options
;
336 tcp_twrespond(tw
, TH_ACK
);
338 inp
->inp_flags
|= INP_TIMEWAIT
;
339 tcp_tw_2msl_reset(tw
, 0);
342 * If the inpcb owns the sole reference to the socket, then we can
343 * detach and free the socket as it is not needed in time wait.
345 if (inp
->inp_flags
& INP_SOCKREF
) {
346 KASSERT(so
->so_state
& SS_PROTOREF
,
347 ("tcp_twstart: !SS_PROTOREF"));
348 inp
->inp_flags
&= ~INP_SOCKREF
;
352 so
->so_state
&= ~SS_PROTOREF
;
359 * Returns 1 if the TIME_WAIT state was killed and we should start over,
360 * looking for a pcb in the listen state. Returns 0 otherwise.
363 tcp_twcheck(struct inpcb
*inp
, struct tcpopt
*to __unused
, struct tcphdr
*th
,
364 struct mbuf
*m
, int tlen
)
370 INP_INFO_RLOCK_ASSERT(&V_tcbinfo
);
371 INP_WLOCK_ASSERT(inp
);
374 * XXXRW: Time wait state for inpcb has been recycled, but inpcb is
375 * still present. This is undesirable, but temporarily necessary
376 * until we work out how to handle inpcbs whose timewait state has
383 thflags
= th
->th_flags
;
386 * NOTE: for FIN_WAIT_2 (to be added later),
387 * must validate sequence number before accepting RST
391 * If the segment contains RST:
392 * Drop the segment - see Stevens, vol. 2, p. 964 and
395 if (thflags
& TH_RST
)
399 /* PAWS not needed at the moment */
401 * RFC 1323 PAWS: If we have a timestamp reply on this segment
402 * and it's less than ts_recent, drop it.
404 if ((to
.to_flags
& TOF_TS
) != 0 && tp
->ts_recent
&&
405 TSTMP_LT(to
.to_tsval
, tp
->ts_recent
)) {
406 if ((thflags
& TH_ACK
) == 0)
411 * ts_recent is never updated because we never accept new segments.
416 * If a new connection request is received
417 * while in TIME_WAIT, drop the old connection
418 * and start over if the sequence numbers
419 * are above the previous ones.
421 if ((thflags
& TH_SYN
) && SEQ_GT(th
->th_seq
, tw
->rcv_nxt
)) {
427 * Drop the segment if it does not contain an ACK.
429 if ((thflags
& TH_ACK
) == 0)
433 * Reset the 2MSL timer if this is a duplicate FIN.
435 if (thflags
& TH_FIN
) {
436 seq
= th
->th_seq
+ tlen
+ (thflags
& TH_SYN
? 1 : 0);
437 if (seq
+ 1 == tw
->rcv_nxt
)
438 tcp_tw_2msl_reset(tw
, 1);
442 * Acknowledge the segment if it has data or is not a duplicate ACK.
444 if (thflags
!= TH_ACK
|| tlen
!= 0 ||
445 th
->th_seq
!= tw
->rcv_nxt
|| th
->th_ack
!= tw
->snd_nxt
)
446 tcp_twrespond(tw
, TH_ACK
);
454 tcp_twclose(struct tcptw
*tw
, int reuse
)
460 * At this point, we are in one of two situations:
462 * (1) We have no socket, just an inpcb<->twtcp pair. We can free
465 * (2) We have a socket -- if we own a reference, release it and
466 * notify the socket layer.
469 KASSERT((inp
->inp_flags
& INP_TIMEWAIT
), ("tcp_twclose: !timewait"));
470 KASSERT(intotw(inp
) == tw
, ("tcp_twclose: inp_ppcb != tw"));
471 INP_INFO_RLOCK_ASSERT(&V_tcbinfo
); /* in_pcbfree() */
472 INP_WLOCK_ASSERT(inp
);
474 tcp_tw_2msl_stop(tw
, reuse
);
475 inp
->inp_ppcb
= NULL
;
478 so
= inp
->inp_socket
;
481 * If there's a socket, handle two cases: first, we own a
482 * strong reference, which we will now release, or we don't
483 * in which case another reference exists (XXXRW: think
484 * about this more), and we don't need to take action.
486 if (inp
->inp_flags
& INP_SOCKREF
) {
487 inp
->inp_flags
&= ~INP_SOCKREF
;
491 KASSERT(so
->so_state
& SS_PROTOREF
,
492 ("tcp_twclose: INP_SOCKREF && !SS_PROTOREF"));
493 so
->so_state
&= ~SS_PROTOREF
;
497 * If we don't own the only reference, the socket and
498 * inpcb need to be left around to be handled by
499 * tcp_usr_detach() later.
505 * The socket has been already cleaned-up for us, only free the
510 TCPSTAT_INC(tcps_closed
);
514 tcp_twrespond(struct tcptw
*tw
, int flags
)
516 struct inpcb
*inp
= tw
->tw_inpcb
;
517 #if defined(INET6) || defined(INET)
518 struct tcphdr
*th
= NULL
;
522 struct ip
*ip
= NULL
;
524 u_int hdrlen
, optlen
;
525 int error
= 0; /* Keep compiler happy */
528 struct ip6_hdr
*ip6
= NULL
;
529 int isipv6
= inp
->inp_inc
.inc_flags
& INC_ISIPV6
;
531 hdrlen
= 0; /* Keep compiler happy */
533 INP_WLOCK_ASSERT(inp
);
535 m
= m_gethdr(M_NOWAIT
, MT_DATA
);
538 m
->m_data
+= max_linkhdr
;
541 mac_inpcb_create_mbuf(inp
, m
);
546 hdrlen
= sizeof(struct ip6_hdr
) + sizeof(struct tcphdr
);
547 ip6
= mtod(m
, struct ip6_hdr
*);
548 th
= (struct tcphdr
*)(ip6
+ 1);
549 tcpip_fillheaders(inp
, ip6
, th
);
552 #if defined(INET6) && defined(INET)
557 hdrlen
= sizeof(struct tcpiphdr
);
558 ip
= mtod(m
, struct ip
*);
559 th
= (struct tcphdr
*)(ip
+ 1);
560 tcpip_fillheaders(inp
, ip
, th
);
566 * Send a timestamp and echo-reply if both our side and our peer
567 * have sent timestamps in our SYN's and this is not a RST.
569 if (tw
->t_recent
&& flags
== TH_ACK
) {
570 to
.to_flags
|= TOF_TS
;
571 to
.to_tsval
= tcp_ts_getticks() + tw
->ts_offset
;
572 to
.to_tsecr
= tw
->t_recent
;
574 optlen
= tcp_addoptions(&to
, (u_char
*)(th
+ 1));
576 m
->m_len
= hdrlen
+ optlen
;
577 m
->m_pkthdr
.len
= m
->m_len
;
579 KASSERT(max_linkhdr
+ m
->m_len
<= MHLEN
, ("tcptw: mbuf too small"));
581 th
->th_seq
= htonl(tw
->snd_nxt
);
582 th
->th_ack
= htonl(tw
->rcv_nxt
);
583 th
->th_off
= (sizeof(struct tcphdr
) + optlen
) >> 2;
584 th
->th_flags
= flags
;
585 th
->th_win
= htons(tw
->last_win
);
587 m
->m_pkthdr
.csum_data
= offsetof(struct tcphdr
, th_sum
);
590 m
->m_pkthdr
.csum_flags
= CSUM_TCP_IPV6
;
591 th
->th_sum
= in6_cksum_pseudo(ip6
,
592 sizeof(struct tcphdr
) + optlen
, IPPROTO_TCP
, 0);
593 ip6
->ip6_hlim
= in6_selecthlim(inp
, NULL
);
594 error
= ip6_output(m
, inp
->in6p_outputopts
, NULL
,
595 (tw
->tw_so_options
& SO_DONTROUTE
), NULL
, NULL
, inp
);
598 #if defined(INET6) && defined(INET)
603 m
->m_pkthdr
.csum_flags
= CSUM_TCP
;
604 th
->th_sum
= in_pseudo(ip
->ip_src
.s_addr
, ip
->ip_dst
.s_addr
,
605 htons(sizeof(struct tcphdr
) + optlen
+ IPPROTO_TCP
));
606 ip
->ip_len
= htons(m
->m_pkthdr
.len
);
607 if (V_path_mtu_discovery
)
608 ip
->ip_off
|= htons(IP_DF
);
609 error
= ip_output(m
, inp
->inp_options
, NULL
,
610 ((tw
->tw_so_options
& SO_DONTROUTE
) ? IP_ROUTETOIF
: 0),
615 TCPSTAT_INC(tcps_sndacks
);
617 TCPSTAT_INC(tcps_sndctrl
);
618 TCPSTAT_INC(tcps_sndtotal
);
623 tcp_tw_2msl_reset(struct tcptw
*tw
, int rearm
)
626 INP_INFO_RLOCK_ASSERT(&V_tcbinfo
);
627 INP_WLOCK_ASSERT(tw
->tw_inpcb
);
631 TAILQ_REMOVE(&V_twq_2msl
, tw
, tw_2msl
);
632 tw
->tw_time
= ticks
+ 2 * tcp_msl
;
633 TAILQ_INSERT_TAIL(&V_twq_2msl
, tw
, tw_2msl
);
634 TW_WUNLOCK(V_tw_lock
);
638 tcp_tw_2msl_stop(struct tcptw
*tw
, int reuse
)
644 INP_INFO_RLOCK_ASSERT(&V_tcbinfo
);
650 TAILQ_REMOVE(&V_twq_2msl
, tw
, tw_2msl
);
653 TW_WUNLOCK(V_tw_lock
);
658 released
= in_pcbrele_wlocked(inp
);
659 KASSERT(!released
, ("%s: inp should not be released here", __func__
));
662 uma_zfree(V_tcptw_zone
, tw
);
663 TCPSTATES_DEC(TCPS_TIME_WAIT
);
667 tcp_tw_2msl_scan(int reuse
)
675 * Exclusive pcbinfo lock is not required in reuse case even if
676 * two inpcb locks can be acquired simultaneously:
677 * - the inpcb transitioning to TIME_WAIT state in
679 * - the inpcb closed by tcp_twclose().
681 * This is because only inpcbs in FIN_WAIT_2 or CLOSING states can
682 * transition to TIME_WAIT state. Thus an inpcb cannot be in the
683 * TIME_WAIT list and transitioning to TIME_WAIT state at the same
686 INP_INFO_RLOCK_ASSERT(&V_tcbinfo
);
692 tw
= TAILQ_FIRST(&V_twq_2msl
);
693 if (tw
== NULL
|| (!reuse
&& (tw
->tw_time
- ticks
) > 0)) {
694 TW_RUNLOCK(V_tw_lock
);
697 KASSERT(tw
->tw_inpcb
!= NULL
, ("%s: tw->tw_inpcb == NULL",
702 TW_RUNLOCK(V_tw_lock
);
704 if (INP_INFO_TRY_RLOCK(&V_tcbinfo
)) {
708 if (in_pcbrele_wlocked(inp
)) {
709 KASSERT(tw
== NULL
, ("%s: held last inp "
710 "reference but tw not NULL", __func__
));
711 INP_INFO_RUNLOCK(&V_tcbinfo
);
716 /* tcp_twclose() has already been called */
718 INP_INFO_RUNLOCK(&V_tcbinfo
);
722 tcp_twclose(tw
, reuse
);
723 INP_INFO_RUNLOCK(&V_tcbinfo
);
727 /* INP_INFO lock is busy, continue later. */
729 if (!in_pcbrele_wlocked(inp
))