4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
25 * Copyright 2011 Joyent, Inc. All rights reserved.
26 * Copyright (c) 2014 by Delphix. All rights reserved.
29 #include <sys/types.h>
30 #include <sys/strlog.h>
31 #include <sys/strsun.h>
32 #include <sys/squeue_impl.h>
33 #include <sys/squeue.h>
34 #include <sys/callo.h>
35 #include <sys/strsubr.h>
37 #include <inet/common.h>
39 #include <inet/ip_ire.h>
40 #include <inet/ip_rts.h>
42 #include <inet/tcp_impl.h>
45 * Implementation of TCP Timers.
46 * =============================
50 * There are two basic functions dealing with tcp timers:
52 * timeout_id_t tcp_timeout(connp, func, time)
53 * clock_t tcp_timeout_cancel(connp, timeout_id)
54 * TCP_TIMER_RESTART(tcp, intvl)
56 * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func'
57 * after 'time' ticks passed. The function called by timeout() must adhere to
58 * the same restrictions as a driver soft interrupt handler - it must not sleep
59 * or call other functions that might sleep. The value returned is the opaque
60 * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to
61 * cancel the request. The call to tcp_timeout() may fail in which case it
62 * returns zero. This is different from the timeout(9F) function which never
65 * The call-back function 'func' always receives 'connp' as its single
66 * argument. It is always executed in the squeue corresponding to the tcp
67 * structure. The tcp structure is guaranteed to be present at the time the
68 * call-back is called.
70 * NOTE: The call-back function 'func' is never called if tcp is in
71 * the TCPS_CLOSED state.
73 * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
74 * request. Locks acquired by the call-back routine should not be held across
75 * the call to tcp_timeout_cancel() or a deadlock may result.
77 * tcp_timeout_cancel() returns -1 if the timeout request is invalid.
78 * Otherwise, it returns an integer value greater than or equal to 0.
80 * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
81 * within squeue context corresponding to the tcp instance. Since the
82 * call-back is also called via the same squeue, there are no race
83 * conditions described in untimeout(9F) manual page since all calls are
84 * strictly serialized.
86 * TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
87 * stored in tcp_timer_tid and starts a new one using
88 * MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back
89 * and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid
94 * TCP timers are implemented using a three-stage process. The call to
95 * tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function
96 * when the timer expires. The tcp_timer_callback() arranges the call of the
97 * tcp_timer_handler() function via squeue corresponding to the tcp
98 * instance. The tcp_timer_handler() calls actual requested timeout call-back
99 * and passes tcp instance as an argument to it. Information is passed between
100 * stages using the tcp_timer_t structure which contains the connp pointer, the
101 * tcp call-back to call and the timeout id returned by the timeout(9F).
103 * The tcp_timer_t structure is not used directly, it is embedded in an mblk_t -
104 * like structure that is used to enter an squeue. The mp->b_rptr of this pseudo
105 * mblk points to the beginning of tcp_timer_t structure. The tcp_timeout()
106 * returns the pointer to this mblk.
108 * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It
109 * looks like a normal mblk without actual dblk attached to it.
111 * To optimize performance each tcp instance holds a small cache of timer
112 * mblocks. In the current implementation it caches up to two timer mblocks per
113 * tcp instance. The cache is preserved over tcp frees and is only freed when
114 * the whole tcp structure is destroyed by its kmem destructor. Since all tcp
115 * timer processing happens on a corresponding squeue, the cache manipulation
116 * does not require any locks. Experiments show that majority of timer mblocks
117 * allocations are satisfied from the tcp cache and do not involve kmem calls.
119 * The tcp_timeout() places a refhold on the connp instance which guarantees
120 * that it will be present at the time the call-back function fires. The
121 * tcp_timer_handler() drops the reference after calling the call-back, so the
122 * call-back function does not need to manipulate the references explicitly.
125 kmem_cache_t
*tcp_timercache
;
127 static void tcp_ip_notify(tcp_t
*);
128 static void tcp_timer_callback(void *);
129 static void tcp_timer_free(tcp_t
*, mblk_t
*);
130 static void tcp_timer_handler(void *, mblk_t
*, void *, ip_recv_attr_t
*);
133 * tim is in millisec.
136 tcp_timeout(conn_t
*connp
, void (*f
)(void *), hrtime_t tim
)
140 tcp_t
*tcp
= connp
->conn_tcp
;
142 ASSERT(connp
->conn_sqp
!= NULL
);
144 TCP_DBGSTAT(tcp
->tcp_tcps
, tcp_timeout_calls
);
146 if (tcp
->tcp_timercache
== NULL
) {
147 mp
= tcp_timermp_alloc(KM_NOSLEEP
| KM_PANIC
);
149 TCP_DBGSTAT(tcp
->tcp_tcps
, tcp_timeout_cached_alloc
);
150 mp
= tcp
->tcp_timercache
;
151 tcp
->tcp_timercache
= mp
->b_next
;
153 ASSERT(mp
->b_wptr
== NULL
);
157 tcpt
= (tcp_timer_t
*)mp
->b_rptr
;
161 * TCP timers are normal timeouts. Plus, they do not require more than
162 * a 10 millisecond resolution. By choosing a coarser resolution and by
163 * rounding up the expiration to the next resolution boundary, we can
164 * batch timers in the callout subsystem to make TCP timers more
165 * efficient. The roundup also protects short timers from expiring too
166 * early before they have a chance to be cancelled.
168 tcpt
->tcpt_tid
= timeout_generic(CALLOUT_NORMAL
, tcp_timer_callback
, mp
,
169 tim
* MICROSEC
, CALLOUT_TCP_RESOLUTION
, CALLOUT_FLAG_ROUNDUP
);
170 VERIFY(!(tcpt
->tcpt_tid
& CALLOUT_ID_FREE
));
172 return ((timeout_id_t
)mp
);
176 tcp_timer_callback(void *arg
)
178 mblk_t
*mp
= (mblk_t
*)arg
;
182 tcpt
= (tcp_timer_t
*)mp
->b_rptr
;
184 SQUEUE_ENTER_ONE(connp
->conn_sqp
, mp
, tcp_timer_handler
, connp
,
185 NULL
, SQ_FILL
, SQTAG_TCP_TIMER
);
190 tcp_timer_handler(void *arg
, mblk_t
*mp
, void *arg2
, ip_recv_attr_t
*dummy
)
193 conn_t
*connp
= (conn_t
*)arg
;
194 tcp_t
*tcp
= connp
->conn_tcp
;
196 tcpt
= (tcp_timer_t
*)mp
->b_rptr
;
197 ASSERT(connp
== tcpt
->connp
);
198 ASSERT((squeue_t
*)arg2
== connp
->conn_sqp
);
200 if (tcpt
->tcpt_tid
& CALLOUT_ID_FREE
) {
202 * This timeout was cancelled after it was enqueued to the
203 * squeue; free the timer and return.
205 tcp_timer_free(connp
->conn_tcp
, mp
);
210 * If the TCP has reached the closed state, don't proceed any
211 * further. This TCP logically does not exist on the system.
212 * tcpt_proc could for example access queues, that have already
213 * been qprocoff'ed off.
215 if (tcp
->tcp_state
!= TCPS_CLOSED
) {
216 (*tcpt
->tcpt_proc
)(connp
);
218 tcp
->tcp_timer_tid
= 0;
221 tcp_timer_free(connp
->conn_tcp
, mp
);
225 * There is potential race with untimeout and the handler firing at the same
226 * time. The mblock may be freed by the handler while we are trying to use
227 * it. But since both should execute on the same squeue, this race should not
231 tcp_timeout_cancel(conn_t
*connp
, timeout_id_t id
)
233 mblk_t
*mp
= (mblk_t
*)id
;
237 TCP_DBGSTAT(connp
->conn_tcp
->tcp_tcps
, tcp_timeout_cancel_reqs
);
242 tcpt
= (tcp_timer_t
*)mp
->b_rptr
;
243 ASSERT(tcpt
->connp
== connp
);
245 delta
= untimeout_default(tcpt
->tcpt_tid
, 0);
248 TCP_DBGSTAT(connp
->conn_tcp
->tcp_tcps
, tcp_timeout_canceled
);
249 tcp_timer_free(connp
->conn_tcp
, mp
);
253 * If we were unable to untimeout successfully, it has already
254 * been enqueued on the squeue; mark the ID with the free
255 * bit. This bit can never be set in a valid identifier, and
256 * we'll use it to prevent the timeout from being executed.
257 * And note that we're within the squeue perimeter here, so
258 * we don't need to worry about racing with timer handling
259 * (which also executes within the perimeter).
261 tcpt
->tcpt_tid
|= CALLOUT_ID_FREE
;
265 return (TICK_TO_MSEC(delta
));
269 * Allocate space for the timer event. The allocation looks like mblk, but it is
270 * not a proper mblk. To avoid confusion we set b_wptr to NULL.
272 * Dealing with failures: If we can't allocate from the timer cache we try
273 * allocating from dblock caches using allocb_tryhard(). In this case b_wptr
275 * If we can't allocate anything using allocb_tryhard(), we perform a last
276 * attempt and use kmem_alloc_tryhard(). In this case we set b_wptr to -1 and
277 * save the actual allocation size in b_datap.
280 tcp_timermp_alloc(int kmflags
)
282 mblk_t
*mp
= (mblk_t
*)kmem_cache_alloc(tcp_timercache
,
283 kmflags
& ~KM_PANIC
);
286 mp
->b_next
= mp
->b_prev
= NULL
;
287 mp
->b_rptr
= (uchar_t
*)(&mp
[1]);
292 } else if (kmflags
& KM_PANIC
) {
294 * Failed to allocate memory for the timer. Try allocating from
297 /* ipclassifier calls this from a constructor - hence no tcps */
298 TCP_G_STAT(tcp_timermp_allocfail
);
299 mp
= allocb_tryhard(sizeof (tcp_timer_t
));
303 * Memory is really low. Try tryhard allocation.
305 * ipclassifier calls this from a constructor -
308 TCP_G_STAT(tcp_timermp_allocdblfail
);
309 mp
= kmem_alloc_tryhard(sizeof (mblk_t
) +
310 sizeof (tcp_timer_t
), &size
, kmflags
);
311 mp
->b_rptr
= (uchar_t
*)(&mp
[1]);
312 mp
->b_next
= mp
->b_prev
= NULL
;
313 mp
->b_wptr
= (uchar_t
*)-1;
314 mp
->b_datap
= (dblk_t
*)size
;
318 ASSERT(mp
->b_wptr
!= NULL
);
320 /* ipclassifier calls this from a constructor - hence no tcps */
321 TCP_G_DBGSTAT(tcp_timermp_alloced
);
327 * Free per-tcp timer cache.
328 * It can only contain entries from tcp_timercache.
331 tcp_timermp_free(tcp_t
*tcp
)
335 while ((mp
= tcp
->tcp_timercache
) != NULL
) {
336 ASSERT(mp
->b_wptr
== NULL
);
337 tcp
->tcp_timercache
= tcp
->tcp_timercache
->b_next
;
338 kmem_cache_free(tcp_timercache
, mp
);
343 * Free timer event. Put it on the per-tcp timer cache if there are not too many
344 * events there already (currently at most two events are cached).
345 * If the event is not allocated from the timer cache, free it right away.
348 tcp_timer_free(tcp_t
*tcp
, mblk_t
*mp
)
350 mblk_t
*mp1
= tcp
->tcp_timercache
;
352 if (mp
->b_wptr
!= NULL
) {
354 * This allocation is not from a timer cache, free it right
357 if (mp
->b_wptr
!= (uchar_t
*)-1)
360 kmem_free(mp
, (size_t)mp
->b_datap
);
361 } else if (mp1
== NULL
|| mp1
->b_next
== NULL
) {
362 /* Cache this timer block for future allocations */
363 mp
->b_rptr
= (uchar_t
*)(&mp
[1]);
365 tcp
->tcp_timercache
= mp
;
367 kmem_cache_free(tcp_timercache
, mp
);
368 TCP_DBGSTAT(tcp
->tcp_tcps
, tcp_timermp_freed
);
373 * Stop all TCP timers.
376 tcp_timers_stop(tcp_t
*tcp
)
378 if (tcp
->tcp_timer_tid
!= 0) {
379 (void) TCP_TIMER_CANCEL(tcp
, tcp
->tcp_timer_tid
);
380 tcp
->tcp_timer_tid
= 0;
382 if (tcp
->tcp_ka_tid
!= 0) {
383 (void) TCP_TIMER_CANCEL(tcp
, tcp
->tcp_ka_tid
);
386 if (tcp
->tcp_ack_tid
!= 0) {
387 (void) TCP_TIMER_CANCEL(tcp
, tcp
->tcp_ack_tid
);
388 tcp
->tcp_ack_tid
= 0;
390 if (tcp
->tcp_push_tid
!= 0) {
391 (void) TCP_TIMER_CANCEL(tcp
, tcp
->tcp_push_tid
);
392 tcp
->tcp_push_tid
= 0;
394 if (tcp
->tcp_reass_tid
!= 0) {
395 (void) TCP_TIMER_CANCEL(tcp
, tcp
->tcp_reass_tid
);
396 tcp
->tcp_reass_tid
= 0;
401 * Timer callback routine for keepalive probe. We do a fake resend of
402 * last ACKed byte. Then set a timer using RTO. When the timer expires,
403 * check to see if we have heard anything from the other end for the last
404 * RTO period. If we have, set the timer to expire for another
405 * tcp_keepalive_intrvl and check again. If we have not, set a timer using
406 * RTO << 1 and check again when it expires. Keep exponentially increasing
407 * the timeout if we have not heard from the other side. If for more than
408 * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything,
409 * kill the connection unless the keepalive abort threshold is 0. In
410 * that case, we will probe "forever."
411 * If tcp_ka_cnt and tcp_ka_rinterval are non-zero, then we do not follow
412 * the exponential backoff, but send probes tcp_ka_cnt times in regular
413 * intervals of tcp_ka_rinterval milliseconds until we hear back from peer.
414 * Kill the connection if we don't hear back from peer after tcp_ka_cnt
418 tcp_keepalive_timer(void *arg
)
421 conn_t
*connp
= (conn_t
*)arg
;
422 tcp_t
*tcp
= connp
->conn_tcp
;
426 tcp_stack_t
*tcps
= tcp
->tcp_tcps
;
433 TCPS_BUMP_MIB(tcps
, tcpTimKeepalive
);
434 ka_intrvl
= tcp
->tcp_ka_interval
;
437 * Keepalive probe should only be sent if the application has not
438 * done a close on the connection.
440 if (tcp
->tcp_state
> TCPS_CLOSE_WAIT
) {
443 /* Timer fired too early, restart it. */
444 if (tcp
->tcp_state
< TCPS_ESTABLISHED
) {
445 tcp
->tcp_ka_tid
= TCP_TIMER(tcp
, tcp_keepalive_timer
,
450 idletime
= TICK_TO_MSEC(ddi_get_lbolt() - tcp
->tcp_last_recv_time
);
452 * If we have not heard from the other side for a long
453 * time, kill the connection unless the keepalive abort
454 * threshold is 0. In that case, we will probe "forever."
456 if (tcp
->tcp_ka_abort_thres
!= 0 &&
457 idletime
> (ka_intrvl
+ tcp
->tcp_ka_abort_thres
)) {
458 TCPS_BUMP_MIB(tcps
, tcpTimKeepaliveDrop
);
459 (void) tcp_clean_death(tcp
, tcp
->tcp_client_errno
?
460 tcp
->tcp_client_errno
: ETIMEDOUT
);
464 if (tcp
->tcp_snxt
== tcp
->tcp_suna
&&
465 idletime
>= ka_intrvl
) {
466 /* Fake resend of last ACKed byte. */
467 mblk_t
*mp1
= allocb(1, BPRI_LO
);
470 *mp1
->b_wptr
++ = '\0';
471 mp
= tcp_xmit_mp(tcp
, mp1
, 1, NULL
, NULL
,
472 tcp
->tcp_suna
- 1, B_FALSE
, NULL
, B_TRUE
);
475 * if allocation failed, fall through to start the
479 tcp_send_data(tcp
, mp
);
480 TCPS_BUMP_MIB(tcps
, tcpTimKeepaliveProbe
);
481 if (tcp
->tcp_ka_rinterval
) {
482 firetime
= tcp
->tcp_ka_rinterval
;
483 } else if (tcp
->tcp_ka_last_intrvl
!= 0) {
486 * We should probe again at least
487 * in ka_intrvl, but not more than
490 max
= tcp
->tcp_rto_max
;
491 firetime
= MIN(ka_intrvl
- 1,
492 tcp
->tcp_ka_last_intrvl
<< 1);
496 firetime
= tcp
->tcp_rto
;
498 tcp
->tcp_ka_tid
= TCP_TIMER(tcp
,
499 tcp_keepalive_timer
, firetime
);
500 tcp
->tcp_ka_last_intrvl
= firetime
;
505 tcp
->tcp_ka_last_intrvl
= 0;
508 /* firetime can be negative if (mp1 == NULL || mp == NULL) */
509 if ((firetime
= ka_intrvl
- idletime
) < 0) {
510 firetime
= ka_intrvl
;
512 tcp
->tcp_ka_tid
= TCP_TIMER(tcp
, tcp_keepalive_timer
, firetime
);
516 tcp_reass_timer(void *arg
)
518 conn_t
*connp
= (conn_t
*)arg
;
519 tcp_t
*tcp
= connp
->conn_tcp
;
521 tcp
->tcp_reass_tid
= 0;
522 if (tcp
->tcp_reass_head
== NULL
)
524 ASSERT(tcp
->tcp_reass_tail
!= NULL
);
525 if (tcp
->tcp_snd_sack_ok
&& tcp
->tcp_num_sack_blk
> 0) {
526 tcp_sack_remove(tcp
->tcp_sack_list
,
527 TCP_REASS_END(tcp
->tcp_reass_tail
), &tcp
->tcp_num_sack_blk
);
529 tcp_close_mpp(&tcp
->tcp_reass_head
);
530 tcp
->tcp_reass_tail
= NULL
;
531 TCP_STAT(tcp
->tcp_tcps
, tcp_reass_timeout
);
534 /* This function handles the push timeout. */
536 tcp_push_timer(void *arg
)
538 conn_t
*connp
= (conn_t
*)arg
;
539 tcp_t
*tcp
= connp
->conn_tcp
;
541 TCP_DBGSTAT(tcp
->tcp_tcps
, tcp_push_timer_cnt
);
543 ASSERT(tcp
->tcp_listener
== NULL
);
545 ASSERT(!IPCL_IS_NONSTR(connp
));
547 tcp
->tcp_push_tid
= 0;
549 if (tcp
->tcp_rcv_list
!= NULL
&&
550 tcp_rcv_drain(tcp
) == TH_ACK_NEEDED
)
551 tcp_xmit_ctl(NULL
, tcp
, tcp
->tcp_snxt
, tcp
->tcp_rnxt
, TH_ACK
);
555 * This function handles delayed ACK timeout.
558 tcp_ack_timer(void *arg
)
560 conn_t
*connp
= (conn_t
*)arg
;
561 tcp_t
*tcp
= connp
->conn_tcp
;
563 tcp_stack_t
*tcps
= tcp
->tcp_tcps
;
565 TCP_DBGSTAT(tcps
, tcp_ack_timer_cnt
);
567 tcp
->tcp_ack_tid
= 0;
573 * Do not send ACK if there is no outstanding unack'ed data.
575 if (tcp
->tcp_rnxt
== tcp
->tcp_rack
) {
579 if ((tcp
->tcp_rnxt
- tcp
->tcp_rack
) > tcp
->tcp_mss
) {
581 * Make sure we don't allow deferred ACKs to result in
582 * timer-based ACKing. If we have held off an ACK
583 * when there was more than an mss here, and the timer
584 * goes off, we have to worry about the possibility
585 * that the sender isn't doing slow-start, or is out
586 * of step with us for some other reason. We fall
587 * permanently back in the direction of
588 * ACK-every-other-packet as suggested in RFC 1122.
590 if (tcp
->tcp_rack_abs_max
> 2)
591 tcp
->tcp_rack_abs_max
--;
592 tcp
->tcp_rack_cur_max
= 2;
594 mp
= tcp_ack_mp(tcp
);
597 BUMP_LOCAL(tcp
->tcp_obsegs
);
598 TCPS_BUMP_MIB(tcps
, tcpOutAck
);
599 TCPS_BUMP_MIB(tcps
, tcpOutAckDelayed
);
600 tcp_send_data(tcp
, mp
);
605 * Notify IP that we are having trouble with this connection. IP should
606 * make note so it can potentially use a different IRE.
609 tcp_ip_notify(tcp_t
*tcp
)
611 conn_t
*connp
= tcp
->tcp_connp
;
615 * Note: in the case of source routing we want to blow away the
616 * route to the first source route hop.
618 ire
= connp
->conn_ixa
->ixa_ire
;
619 if (ire
!= NULL
&& !(ire
->ire_flags
& (RTF_REJECT
|RTF_BLACKHOLE
))) {
620 if (ire
->ire_ipversion
== IPV4_VERSION
) {
622 * As per RFC 1122, we send an RTM_LOSING to inform
625 ip_rts_change(RTM_LOSING
, ire
->ire_addr
,
626 ire
->ire_gateway_addr
, ire
->ire_mask
,
627 connp
->conn_laddr_v4
, 0, 0, 0,
628 (RTA_DST
| RTA_GATEWAY
| RTA_NETMASK
| RTA_IFA
),
631 (void) ire_no_good(ire
);
636 * tcp_timer is the timer service routine. It handles the retransmission,
637 * FIN_WAIT_2 flush, and zero window probe timeout events. It figures out
638 * from the state of the tcp instance what kind of action needs to be done
639 * at the time it is called.
645 clock_t first_threshold
;
646 clock_t second_threshold
;
649 conn_t
*connp
= (conn_t
*)arg
;
650 tcp_t
*tcp
= connp
->conn_tcp
;
651 tcp_stack_t
*tcps
= tcp
->tcp_tcps
;
652 boolean_t dont_timeout
= B_FALSE
;
654 tcp
->tcp_timer_tid
= 0;
659 first_threshold
= tcp
->tcp_first_timer_threshold
;
660 second_threshold
= tcp
->tcp_second_timer_threshold
;
661 switch (tcp
->tcp_state
) {
666 case TCPS_SYN_RCVD
: {
667 tcp_t
*listener
= tcp
->tcp_listener
;
669 if (tcp
->tcp_syn_rcvd_timeout
== 0 && (listener
!= NULL
)) {
670 /* it's our first timeout */
671 tcp
->tcp_syn_rcvd_timeout
= 1;
672 mutex_enter(&listener
->tcp_eager_lock
);
673 listener
->tcp_syn_rcvd_timeout
++;
674 if (!tcp
->tcp_dontdrop
&& !tcp
->tcp_closemp_used
) {
676 * Make this eager available for drop if we
677 * need to drop one to accommodate a new
678 * incoming SYN request.
680 MAKE_DROPPABLE(listener
, tcp
);
682 if (!listener
->tcp_syn_defense
&&
683 (listener
->tcp_syn_rcvd_timeout
>
684 (tcps
->tcps_conn_req_max_q0
>> 2)) &&
685 (tcps
->tcps_conn_req_max_q0
> 200)) {
686 /* We may be under attack. Put on a defense. */
687 listener
->tcp_syn_defense
= B_TRUE
;
688 cmn_err(CE_WARN
, "High TCP connect timeout "
689 "rate! System (port %d) may be under a "
691 ntohs(listener
->tcp_connp
->conn_lport
));
693 listener
->tcp_ip_addr_cache
= kmem_zalloc(
694 IP_ADDR_CACHE_SIZE
* sizeof (ipaddr_t
),
697 mutex_exit(&listener
->tcp_eager_lock
);
698 } else if (listener
!= NULL
) {
699 mutex_enter(&listener
->tcp_eager_lock
);
700 tcp
->tcp_syn_rcvd_timeout
++;
701 if (tcp
->tcp_syn_rcvd_timeout
> 1 &&
702 !tcp
->tcp_closemp_used
) {
704 * This is our second timeout. Put the tcp in
705 * the list of droppable eagers to allow it to
706 * be dropped, if needed. We don't check
707 * whether tcp_dontdrop is set or not to
708 * protect ourselves from a SYN attack where a
709 * remote host can spoof itself as one of the
710 * good IP sources and continue to hold
711 * resources too long.
713 MAKE_DROPPABLE(listener
, tcp
);
715 mutex_exit(&listener
->tcp_eager_lock
);
720 first_threshold
= tcp
->tcp_first_ctimer_threshold
;
721 second_threshold
= tcp
->tcp_second_ctimer_threshold
;
724 * If an app has set the second_threshold to 0, it means that
725 * we need to retransmit forever, unless this is a passive
726 * open. We need to set second_threshold back to a normal
727 * value such that later comparison with it still makes
728 * sense. But we set dont_timeout to B_TRUE so that we will
731 if (second_threshold
== 0) {
732 second_threshold
= tcps
->tcps_ip_abort_linterval
;
733 if (tcp
->tcp_active_open
)
734 dont_timeout
= B_TRUE
;
737 case TCPS_ESTABLISHED
:
738 case TCPS_CLOSE_WAIT
:
740 * If the end point has not been closed, TCP can retransmit
741 * forever. But if the end point is closed, the normal
744 if (second_threshold
== 0) {
745 second_threshold
= tcps
->tcps_ip_abort_linterval
;
746 dont_timeout
= B_TRUE
;
749 case TCPS_FIN_WAIT_1
:
752 /* If we have data to rexmit */
753 if (tcp
->tcp_suna
!= tcp
->tcp_snxt
) {
754 clock_t time_to_wait
;
756 TCPS_BUMP_MIB(tcps
, tcpTimRetrans
);
757 if (!tcp
->tcp_xmit_head
)
759 time_to_wait
= ddi_get_lbolt() -
760 (clock_t)tcp
->tcp_xmit_head
->b_prev
;
761 time_to_wait
= tcp
->tcp_rto
-
762 TICK_TO_MSEC(time_to_wait
);
764 * If the timer fires too early, 1 clock tick earlier,
767 if (time_to_wait
> msec_per_tick
) {
768 TCP_STAT(tcps
, tcp_timer_fire_early
);
769 TCP_TIMER_RESTART(tcp
, time_to_wait
);
773 * When we probe zero windows, we force the swnd open.
774 * If our peer acks with a closed window swnd will be
775 * set to zero by tcp_rput(). As long as we are
776 * receiving acks tcp_rput will
777 * reset 'tcp_ms_we_have_waited' so as not to trip the
778 * first and second interval actions. NOTE: the timer
779 * interval is allowed to continue its exponential
782 if (tcp
->tcp_swnd
== 0 || tcp
->tcp_zero_win_probe
) {
783 if (connp
->conn_debug
) {
784 (void) strlog(TCP_MOD_ID
, 0, 1,
785 SL_TRACE
, "tcp_timer: zero win");
789 * After retransmission, we need to do
790 * slow start. Set the ssthresh to one
791 * half of current effective window and
792 * cwnd to one MSS. Also reset
795 * Note that if tcp_ssthresh is reduced because
796 * of ECN, do not reduce it again unless it is
797 * already one window of data away (tcp_cwr
798 * should then be cleared) or this is a
799 * timeout for a retransmitted segment.
803 if (!tcp
->tcp_cwr
|| tcp
->tcp_rexmit
) {
804 npkt
= ((tcp
->tcp_timer_backoff
?
805 tcp
->tcp_cwnd_ssthresh
:
807 tcp
->tcp_suna
) >> 1) / tcp
->tcp_mss
;
808 tcp
->tcp_cwnd_ssthresh
= MAX(npkt
, 2) *
811 tcp
->tcp_cwnd
= tcp
->tcp_mss
;
812 tcp
->tcp_cwnd_cnt
= 0;
813 if (tcp
->tcp_ecn_ok
) {
814 tcp
->tcp_cwr
= B_TRUE
;
815 tcp
->tcp_cwr_snd_max
= tcp
->tcp_snxt
;
816 tcp
->tcp_ecn_cwr_sent
= B_FALSE
;
822 * We have something to send yet we cannot send. The
825 * 1. Zero send window: we need to do zero window probe.
826 * 2. Zero cwnd: because of ECN, we need to "clock out
828 * 3. SWS avoidance: receiver may have shrunk window,
829 * reset our knowledge.
831 * Note that condition 2 can happen with either 1 or
832 * 3. But 1 and 3 are exclusive.
834 if (tcp
->tcp_unsent
!= 0) {
836 * Should not hold the zero-copy messages for too long.
838 if (tcp
->tcp_snd_zcopy_aware
&& !tcp
->tcp_xmit_zc_clean
)
839 tcp
->tcp_xmit_head
= tcp_zcopy_backoff(tcp
,
840 tcp
->tcp_xmit_head
, B_TRUE
);
842 if (tcp
->tcp_cwnd
== 0) {
844 * Set tcp_cwnd to 1 MSS so that a
845 * new segment can be sent out. We
846 * are "clocking out" new data when
847 * the network is really congested.
849 ASSERT(tcp
->tcp_ecn_ok
);
850 tcp
->tcp_cwnd
= tcp
->tcp_mss
;
852 if (tcp
->tcp_swnd
== 0) {
853 /* Extend window for zero window probe */
855 tcp
->tcp_zero_win_probe
= B_TRUE
;
856 TCPS_BUMP_MIB(tcps
, tcpOutWinProbe
);
859 * Handle timeout from sender SWS avoidance.
860 * Reset our knowledge of the max send window
861 * since the receiver might have reduced its
862 * receive buffer. Avoid setting tcp_max_swnd
863 * to one since that will essentially disable
866 * Note that since we don't have a SWS
867 * state variable, if the timeout is set
868 * for ECN but not for SWS, this
869 * code will also be executed. This is
870 * fine as tcp_max_swnd is updated
871 * constantly and it will not affect
874 tcp
->tcp_max_swnd
= MAX(tcp
->tcp_swnd
, 2);
876 tcp_wput_data(tcp
, NULL
, B_FALSE
);
879 /* Is there a FIN that needs to be retransmitted? */
880 if ((tcp
->tcp_valid_bits
& TCP_FSS_VALID
) &&
883 /* Nothing to do, return without restarting timer. */
884 TCP_STAT(tcps
, tcp_timer_fire_miss
);
886 case TCPS_FIN_WAIT_2
:
888 * User closed the TCP endpoint and peer ACK'ed our FIN.
889 * We waited some time for the peer's FIN, but it hasn't
890 * arrived. We flush the connection now to avoid
891 * case where the peer has rebooted.
893 if (TCP_IS_DETACHED(tcp
)) {
894 (void) tcp_clean_death(tcp
, 0);
896 TCP_TIMER_RESTART(tcp
,
897 tcp
->tcp_fin_wait_2_flush_interval
);
901 (void) tcp_clean_death(tcp
, 0);
904 if (connp
->conn_debug
) {
905 (void) strlog(TCP_MOD_ID
, 0, 1, SL_TRACE
|SL_ERROR
,
906 "tcp_timer: strange state (%d) %s",
907 tcp
->tcp_state
, tcp_display(tcp
, NULL
,
914 * If the system is under memory pressure or the max number of
915 * connections have been established for the listener, be more
916 * aggressive in aborting connections.
918 if (tcps
->tcps_reclaim
|| (tcp
->tcp_listen_cnt
!= NULL
&&
919 tcp
->tcp_listen_cnt
->tlc_cnt
> tcp
->tcp_listen_cnt
->tlc_max
)) {
920 second_threshold
= tcp_early_abort
* SECONDS
;
922 /* We will ignore the never timeout promise in this case... */
923 dont_timeout
= B_FALSE
;
926 ASSERT(second_threshold
!= 0);
928 if ((ms
= tcp
->tcp_ms_we_have_waited
) > second_threshold
) {
930 * Should not hold the zero-copy messages for too long.
932 if (tcp
->tcp_snd_zcopy_aware
&& !tcp
->tcp_xmit_zc_clean
)
933 tcp
->tcp_xmit_head
= tcp_zcopy_backoff(tcp
,
934 tcp
->tcp_xmit_head
, B_TRUE
);
938 * Reset tcp_ms_we_have_waited to avoid overflow since
939 * we are going to retransmit forever.
941 tcp
->tcp_ms_we_have_waited
= second_threshold
;
946 * For zero window probe, we need to send indefinitely,
947 * unless we have not heard from the other side for some
950 if ((tcp
->tcp_zero_win_probe
== 0) ||
951 (TICK_TO_MSEC(ddi_get_lbolt() - tcp
->tcp_last_recv_time
) >
953 TCPS_BUMP_MIB(tcps
, tcpTimRetransDrop
);
955 * If TCP is in SYN_RCVD state, send back a
956 * RST|ACK as BSD does. Note that tcp_zero_win_probe
957 * should be zero in TCPS_SYN_RCVD state.
959 if (tcp
->tcp_state
== TCPS_SYN_RCVD
) {
960 tcp_xmit_ctl("tcp_timer: RST sent on timeout "
963 tcp
->tcp_rnxt
, TH_RST
| TH_ACK
);
965 (void) tcp_clean_death(tcp
,
966 tcp
->tcp_client_errno
?
967 tcp
->tcp_client_errno
: ETIMEDOUT
);
971 * If the system is under memory pressure, we also
972 * abort connection in zero window probing.
974 if (tcps
->tcps_reclaim
) {
975 (void) tcp_clean_death(tcp
,
976 tcp
->tcp_client_errno
?
977 tcp
->tcp_client_errno
: ETIMEDOUT
);
978 TCP_STAT(tcps
, tcp_zwin_mem_drop
);
982 * Set tcp_ms_we_have_waited to second_threshold
983 * so that in next timeout, we will do the above
984 * check (ddi_get_lbolt() - tcp_last_recv_time).
985 * This is also to avoid overflow.
987 * We don't need to decrement tcp_timer_backoff
988 * to avoid overflow because it will be decremented
989 * later if new timeout value is greater than
990 * tcp_rto_max. In the case when tcp_rto_max is
991 * greater than second_threshold, it means that we
992 * will wait longer than second_threshold to send
996 tcp
->tcp_ms_we_have_waited
= second_threshold
;
998 } else if (ms
> first_threshold
) {
1000 * Should not hold the zero-copy messages for too long.
1002 if (tcp
->tcp_snd_zcopy_aware
&& !tcp
->tcp_xmit_zc_clean
)
1003 tcp
->tcp_xmit_head
= tcp_zcopy_backoff(tcp
,
1004 tcp
->tcp_xmit_head
, B_TRUE
);
1007 * We have been retransmitting for too long... The RTT
1008 * we calculated is probably incorrect. Reinitialize it.
1009 * Need to compensate for 0 tcp_rtt_sa. Reset
1010 * tcp_rtt_update so that we won't accidentally cache a
1011 * bad value. But only do this if this is not a zero
1014 if (tcp
->tcp_rtt_sa
!= 0 && tcp
->tcp_zero_win_probe
== 0) {
1015 tcp
->tcp_rtt_sd
+= (tcp
->tcp_rtt_sa
>> 3) +
1016 (tcp
->tcp_rtt_sa
>> 5);
1017 tcp
->tcp_rtt_sa
= 0;
1019 tcp
->tcp_rtt_update
= 0;
1024 tcp
->tcp_timer_backoff
++;
1025 if ((ms
= (tcp
->tcp_rtt_sa
>> 3) + tcp
->tcp_rtt_sd
+
1026 tcps
->tcps_rexmit_interval_extra
+ (tcp
->tcp_rtt_sa
>> 5)) <
1029 * This means the original RTO is tcp_rexmit_interval_min.
1030 * So we will use tcp_rexmit_interval_min as the RTO value
1031 * and do the backoff.
1033 ms
= tcp
->tcp_rto_min
<< tcp
->tcp_timer_backoff
;
1035 ms
<<= tcp
->tcp_timer_backoff
;
1037 if (ms
> tcp
->tcp_rto_max
) {
1038 ms
= tcp
->tcp_rto_max
;
1040 * ms is at max, decrement tcp_timer_backoff to avoid
1043 tcp
->tcp_timer_backoff
--;
1045 tcp
->tcp_ms_we_have_waited
+= ms
;
1046 if (tcp
->tcp_zero_win_probe
== 0) {
1049 TCP_TIMER_RESTART(tcp
, ms
);
1051 * This is after a timeout and tcp_rto is backed off. Set
1052 * tcp_set_timer to 1 so that next time RTO is updated, we will
1053 * restart the timer with a correct value.
1055 tcp
->tcp_set_timer
= 1;
1056 mss
= tcp
->tcp_snxt
- tcp
->tcp_suna
;
1057 if (mss
> tcp
->tcp_mss
)
1059 if (mss
> tcp
->tcp_swnd
&& tcp
->tcp_swnd
!= 0)
1060 mss
= tcp
->tcp_swnd
;
1062 if ((mp
= tcp
->tcp_xmit_head
) != NULL
)
1063 mp
->b_prev
= (mblk_t
*)ddi_get_lbolt();
1064 mp
= tcp_xmit_mp(tcp
, mp
, mss
, NULL
, NULL
, tcp
->tcp_suna
, B_TRUE
, &mss
,
1068 * When slow start after retransmission begins, start with
1069 * this seq no. tcp_rexmit_max marks the end of special slow
1072 tcp
->tcp_rexmit_nxt
= tcp
->tcp_suna
;
1073 if ((tcp
->tcp_valid_bits
& TCP_FSS_VALID
) &&
1074 (tcp
->tcp_unsent
== 0)) {
1075 tcp
->tcp_rexmit_max
= tcp
->tcp_fss
;
1077 tcp
->tcp_rexmit_max
= tcp
->tcp_snxt
;
1079 tcp
->tcp_rexmit
= B_TRUE
;
1080 tcp
->tcp_dupack_cnt
= 0;
1083 * Remove all rexmit SACK blk to start from fresh.
1085 if (tcp
->tcp_snd_sack_ok
)
1086 TCP_NOTSACK_REMOVE_ALL(tcp
->tcp_notsack_list
, tcp
);
1091 tcp
->tcp_csuna
= tcp
->tcp_snxt
;
1092 TCPS_BUMP_MIB(tcps
, tcpRetransSegs
);
1093 TCPS_UPDATE_MIB(tcps
, tcpRetransBytes
, mss
);
1094 tcp_send_data(tcp
, mp
);
1099 * Handle lingering timeouts. This function is called when the SO_LINGER timeout
1103 tcp_close_linger_timeout(void *arg
)
1105 conn_t
*connp
= (conn_t
*)arg
;
1106 tcp_t
*tcp
= connp
->conn_tcp
;
1108 tcp
->tcp_client_errno
= ETIMEDOUT
;
1109 tcp_stop_lingering(tcp
);